\documentclass[11pt,twoside]{article}\makeatletter

\IfFileExists{xcolor.sty}%
  {\RequirePackage{xcolor}}%
  {\RequirePackage{color}}
\usepackage{colortbl}
\usepackage{wrapfig}
\usepackage{ifxetex}
% Engine-specific input-encoding and font setup.
\ifxetex
  % XeTeX branch: fonts are selected through fontspec.
  \usepackage{fontspec}
  \usepackage{xunicode}
  % Make three special Unicode characters active so that they typeset a
  % literal backslash and literal braces.
  \catcode`⃥=\active \def⃥{\textbackslash}
  \catcode`❴=\active \def❴{\{}
  \catcode`❵=\active \def❵{\}}
  % Font switches for CJK text.
  \def\textJapanese{\fontspec{Noto Sans CJK JP}}
  \def\textChinese{\fontspec{Noto Sans CJK SC}}
  \def\textKorean{\fontspec{Noto Sans CJK KR}}
  \setmonofont{DejaVu Sans Mono}

\else
  % Non-XeTeX branch: prefer the utf8x input encoding when utf8x.def is
  % installed, otherwise fall back to plain utf8.
  \IfFileExists{utf8x.def}%
   {\usepackage[utf8x]{inputenc}
      \PrerenderUnicode{–}
    }%
   {\usepackage[utf8]{inputenc}}
  \usepackage[english]{babel}
  \usepackage[T1]{fontenc}
  \usepackage{float}
  \usepackage[]{ucs}
  % Per-character replacement text, keyed by decimal code point.  The first
  % three are the same characters the XeTeX branch makes active.
  % NOTE(review): \uc@dclc is presumably an internal of the ucs package --
  % confirm it is still available in the installed version.
  \uc@dclc{8421}{default}{\textbackslash }
  \uc@dclc{10100}{default}{\{}
  \uc@dclc{10101}{default}{\}}
  \uc@dclc{8491}{default}{\AA{}}
  \uc@dclc{8239}{default}{\,}
  \uc@dclc{20154}{default}{ }
  \uc@dclc{10148}{default}{>}
  % Substitute for a schwa glyph: an "e" rotated by -90 degrees.
  \def\textschwa{\rotatebox{-90}{e}}
  % The CJK font switches do nothing in this branch.
  \def\textJapanese{}
  \def\textChinese{}
  % Defined here as well so that \textKorean exists under every engine, in
  % step with the XeTeX branch and with the two switches above.
  \def\textKorean{}
  \IfFileExists{tipa.sty}{\usepackage{tipa}}{}
\fi
% Font switch for example text: typewriter family at \small size.
\def\exampleFont{\ttfamily\small}
% \textpi: text symbol taken from slot 25 of the OML encoding.
\DeclareTextSymbol{\textpi}{OML}{25}
\usepackage{relsize}
\RequirePackage{array}
% \@testpach: classify the current column-specifier token held in \@nextchar.
% The result is stored in \@chclass (token class) and \@chnum (variant within
% the class), taking the previous class \@lastchclass into account.
% Tokens recognised below: c, l, r (class 0, \@chnum 0-2); | ! @ ( )
% (classes 1, 6, 7, 8, 9); m, p, b (class 10, \@chnum 3-5).  Anything else
% ends in \@preamerr.
% NOTE(review): this appears to override the definition supplied by the
% array package loaded just before, using "(" and ")" as the tokens for
% classes 8 and 9 -- presumably in place of that package's "<" and ">";
% confirm against array.sty before changing.  The column types and \Panel
% in this file write ")" in front of their declarations accordingly.
% No comments are placed inside the body: the numeric assignments depend on
% the exact spaces and line ends.
\def\@testpach{\@chclass
 \ifnum \@lastchclass=6 \@ne \@chnum \@ne \else
  \ifnum \@lastchclass=7 5 \else
   \ifnum \@lastchclass=8 \tw@ \else
    \ifnum \@lastchclass=9 \thr@@
   \else \z@
   \ifnum \@lastchclass = 10 \else
   \edef\@nextchar{\expandafter\string\@nextchar}%
   \@chnum
   \if \@nextchar c\z@ \else
    \if \@nextchar l\@ne \else
     \if \@nextchar r\tw@ \else
   \z@ \@chclass
   \if\@nextchar |\@ne \else
    \if \@nextchar !6 \else
     \if \@nextchar @7 \else
      \if \@nextchar (8 \else
       \if \@nextchar )9 \else
  10
  \@chnum
  \if \@nextchar m\thr@@\else
   \if \@nextchar p4 \else
    \if \@nextchar b5 \else
   \z@ \@chclass \z@ \@preamerr \z@ \fi \fi \fi \fi
   \fi \fi  \fi  \fi  \fi  \fi  \fi \fi \fi \fi \fi \fi}
% \arraybackslash: make \\ an alias of \@arraycr again (used below after
% \raggedright, \centering and \raggedleft inside column declarations).
\gdef\arraybackslash{\let\\=\@arraycr}
% \@textsubscript{<text>}: set <text> as a math subscript, inside an \mbox,
% at font size \sf@size.
\def\@textsubscript#1{{\m@th\ensuremath{_{\mbox{\fontsize\sf@size\z@#1}}}}}
% \Panel{<content>}{<colour>}{<columns>}{<column spec>}: a \multicolumn cell
% spanning #3 columns, with column spec #4 preceded by a \columncolor{#2}
% declaration, containing #1.  The ")" is the declaration-prefix token used
% throughout this file (see \@testpach).
\def\Panel#1#2#3#4{\multicolumn{#3}{){\columncolor{#2}}#4}{#1}}
% Macros that expand to nothing: whatever follows them in the text is typeset
% unchanged.  (The names look like TEI element names -- TODO confirm.)
\def\abbr{}
\def\corr{}
\def\expan{}
\def\gap{}
\def\orig{}
\def\reg{}
% NOTE(review): this replaces LaTeX's cross-reference command \ref with an
% empty macro at this point in the preamble.  Packages loaded later may or
% may not redefine it again -- confirm that cross-references still work.
\def\ref{}
\def\sic{}
\def\persName{}\def\name{}
\def\placeName{}
\def\orgName{}
% Text-styling helpers, each taking the text as its single argument.
% \textcal and \textgothic call \fontspec, which this file loads only in the
% \ifxetex branch; using them under another engine would presumably fail.
\def\textcal#1{{\fontspec{Lucida Calligraphy}#1}}
\def\textgothic#1{{\fontspec{Lucida Blackletter}#1}}
\def\textlarge#1{{\large #1}}
% Overline, set in math mode.
\def\textoverbar#1{\ensuremath{\overline{#1}}}
% Wrap the argument in single typographic quotation marks.
\def\textquoted#1{‘#1’}
\def\textsmall#1{{\small #1}}
% Text-mode subscript built on \@textsubscript.
\def\textsubscript#1{\@textsubscript{\selectfont#1}}
% Greek letter xi, usable in text.
\def\textxi{\ensuremath{\xi}}
% Font switch for titles: italic shape.
\def\titlem{\itshape}
% Lightweight environments for bibliographic and title-page elements.
% Environments whose end code is "\ifvmode\par\fi" issue \par only when TeX
% is in vertical mode at that point; those ending in plain \par always end
% the current paragraph.
\newenvironment{biblfree}{}{\ifvmode\par\fi }
\newenvironment{bibl}{}{}
% byline: 6pt skip, then italic 16pt text on an 18pt baseline.
\newenvironment{byline}{\vskip6pt\itshape\fontsize{16pt}{18pt}\selectfont}{\par }
\newenvironment{citbibl}{}{\ifvmode\par\fi }
% docAuthor: always italic; the 4pt skip and the 16pt/18pt size are applied
% only when the environment starts in vertical mode.
\newenvironment{docAuthor}{\ifvmode\vskip4pt\fontsize{16pt}{18pt}\selectfont\fi\itshape}{\ifvmode\par\fi }
\newenvironment{docDate}{}{\ifvmode\par\fi }
\newenvironment{docImprint}{\vskip 6pt}{\ifvmode\par\fi }
% docTitle: 6pt skip, then bold 22pt text on a 25pt baseline.
\newenvironment{docTitle}{\vskip6pt\bfseries\fontsize{22pt}{25pt}\selectfont}{\par }
\newenvironment{msHead}{\vskip 6pt}{\par}
\newenvironment{msItem}{\vskip 6pt}{\par}
\newenvironment{rubric}{}{}
\newenvironment{titlePart}{}{\par }

% Fixed-width column types; the single argument is the column width.
%   L, C, R : p-columns set ragged-right, centred and ragged-left
%   P, B, M : p-, b- and m-columns
% Every declaration calls \arraybackslash so that \\ ends the table row.
% The leading ")" is the declaration-prefix token recognised by the
% \@testpach redefinition earlier in this file.
\newcolumntype{L}[1]{){\raggedright\arraybackslash}p{#1}}
\newcolumntype{C}[1]{){\centering\arraybackslash}p{#1}}
\newcolumntype{R}[1]{){\raggedleft\arraybackslash}p{#1}}
\newcolumntype{P}[1]{){\arraybackslash}p{#1}}
\newcolumntype{B}[1]{){\arraybackslash}b{#1}}
\newcolumntype{M}[1]{){\arraybackslash}m{#1}}
% Grey used by \unusedattribute.
\definecolor{label}{gray}{0.75}
% \unusedattribute{<text>}: <text> in the "label" grey, struck out with \sout
% (\sout comes from ulem, which is loaded further down in this preamble).
\def\unusedattribute#1{\sout{\textcolor{label}{#1}}}
% \xref{<url>}{<text>}: passes <text> and <url> to \hyper@linkurl; the
% arguments are first read through \hyper@normalise.
% NOTE(review): both helpers are presumably hyperref internals (hyperref is
% loaded later in this preamble) -- confirm they exist in the installed version.
\DeclareRobustCommand*{\xref}{\hyper@normalise\xref@}
\def\xref@#1#2{\hyper@linkurl{#2}{#1}}
% Underscore handling.
% Inside the group "_" is made active and globally defined to take one
% argument and set it as an upright (\mathrm) subscript.
\begingroup
\catcode`\_=\active
\gdef_#1{\ensuremath{\sb{\mathrm{#1}}}}
\endgroup
% Math code "8000 makes "_" behave like an active character in math mode, so
% the definition above is used there; category code 12 makes it an ordinary
% character in text.
\mathcode`\_=\string"8000
\catcode`\_=12\relax

\usepackage[a4paper,twoside,lmargin=1in,rmargin=1in,tmargin=1in,bmargin=1in,marginparwidth=0.75in]{geometry}
\usepackage{framed}

\definecolor{shadecolor}{gray}{0.95}
\usepackage{longtable}
\usepackage[normalem]{ulem}
\usepackage{fancyvrb}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage{marginnote}

% \@cite now outputs its argument as is.
\renewcommand{\@cite}[1]{#1}


% Margin notes (marginnote package): italic, footnote size.
\renewcommand*{\marginfont}{\itshape\footnotesize}

% File extensions listed for graphics files.
% NOTE(review): presumably the list graphicx searches when \includegraphics
% is given a name without an extension -- confirm.
\def\Gin@extensions{.pdf,.png,.jpg,.mps,.tif}

  \pagestyle{fancy}

\usepackage[pdftitle={Bayesian Spam Filtering Using Statistical Data Compression},
 pdfauthor={}]{hyperref}
\hyperbaseurl{}

	 \paperwidth210mm
	 \paperheight297mm
              
% Table-of-contents geometry: page-number box width, right margin and
% leader-dot separation.
\def\@pnumwidth{1.55em}
\def\@tocrmarg {2.55em}
\def\@dotsep{4.5}
% Include entries down to level 3 (\subsubsection) in the table of contents.
\setcounter{tocdepth}{3}
% Line- and page-breaking parameters.
\clubpenalty=8000
\emergencystretch 3em
\hbadness=4000
\hyphenpenalty=400
\pretolerance=750
\tolerance=2000
\vbadness=4000
\widowpenalty=10000

% Sectioning commands rebuilt on \@startsection.  The arguments are, in
% order: name, level, indent, space before, space after, heading font.
%   \section        \Large bold
%   \subsection     \Large
%   \subsubsection  \large
%   \paragraph      \normalsize
%   \subparagraph   \normalsize bold, indented by \parindent
% Only \subparagraph has a negative "space after" (-1em).
% NOTE(review): in the LaTeX kernel a negative value there makes the heading
% run in with the following text -- confirm if that matters for a change.
\renewcommand\section{\@startsection {section}{1}{\z@}%
     {-1.75ex \@plus -0.5ex \@minus -.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\Large\bfseries}}
\renewcommand\subsection{\@startsection{subsection}{2}{\z@}%
     {-1.75ex\@plus -0.5ex \@minus- .2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\Large}}
\renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}%
     {-1.5ex\@plus -0.35ex \@minus -.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\large}}
\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}%
     {-1ex \@plus-0.35ex \@minus -0.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\normalsize}}
\renewcommand\subparagraph{\@startsection{subparagraph}{5}{\parindent}%
     {1.5ex \@plus1ex \@minus .2ex}%
     {-1em}%
     {\reset@font\normalsize\bfseries}}


% Table-of-contents entry formats.
% \l@section{<title>}{<page>}: bold entry without leader dots.  It adds
% \@secpenalty and 1em of vertical space above, then sets the title with the
% page number right-aligned in a box of width \@pnumwidth.
\def\l@section#1#2{\addpenalty{\@secpenalty} \addvspace{1.0em plus 1pt}
 \@tempdima 1.5em \begingroup
 \parindent \z@ \rightskip \@pnumwidth 
 \parfillskip -\@pnumwidth 
 \bfseries \leavevmode #1\hfil \hbox to\@pnumwidth{\hss #2}\par
 \endgroup}
% Lower levels use \@dottedtocline{<level>}{<indent>}{<number width>}.
\def\l@subsection{\@dottedtocline{2}{1.5em}{2.3em}}
\def\l@subsubsection{\@dottedtocline{3}{3.8em}{3.2em}}
\def\l@paragraph{\@dottedtocline{4}{7.0em}{4.1em}}
\def\l@subparagraph{\@dottedtocline{5}{10em}{5em}}
% Create the section and chapter counters only if they do not exist yet.
% The \frontmatter, \mainmatter and \backmatter macros below use both.
\@ifundefined{c@section}{\newcounter{section}}{}
\@ifundefined{c@chapter}{\newcounter{chapter}}{}
% Boolean \if@mainmatter, initialised to true.
\newif\if@mainmatter 
\@mainmattertrue
% Book-style division commands.
\def\chaptername{Chapter}
% \frontmatter: roman page numbers, roman chapter and section numbers, and
% an empty \@chapapp.
\def\frontmatter{%
  \pagenumbering{roman}
  \def\thechapter{\@roman\c@chapter}
  \def\theHchapter{\roman{chapter}}
  \def\thesection{\@roman\c@section}
  \def\theHsection{\roman{section}}
  \def\@chapapp{}%
}
% \mainmatter: starts after \cleardoublepage, resets the chapter and section
% counters, switches page, chapter and section numbers to arabic, sets
% secnumdepth to 6 and \@chapapp to \chaptername.
\def\mainmatter{%
  \cleardoublepage
  \def\thechapter{\@arabic\c@chapter}
  \setcounter{chapter}{0}
  \setcounter{section}{0}
  \pagenumbering{arabic}
  \setcounter{secnumdepth}{6}
  \def\@chapapp{\chaptername}%
  \def\theHchapter{\arabic{chapter}}
  \def\thesection{\@arabic\c@section}
  \def\theHsection{\arabic{section}}
}
% \backmatter: starts after \cleardoublepage, resets the chapter and section
% counters, sets secnumdepth to 2, numbers chapters with capital letters,
% sets \@chapapp to \appendixname and finally calls \appendix.
\def\backmatter{%
  \cleardoublepage
  \setcounter{chapter}{0}
  \setcounter{section}{0}
  \setcounter{secnumdepth}{2}
  \def\@chapapp{\appendixname}%
  \def\thechapter{\@Alph\c@chapter}
  \def\theHchapter{\Alph{chapter}}
  \appendix
}
% bibitemlist{<widest label>}: list environment for bibliography entries.
% Items are numbered with the enumiv counter and labelled via \@biblabel.
% \labelwidth is set to the width of \@biblabel{<widest label>}, and the left
% margin to \labelwidth plus \labelsep.  Inside the environment \sloppy is in
% force, club and widow penalties are lowered to 4000, and the space factor
% of "." is set to \@m.  An empty list produces a warning rather than an
% error, because \@noitemerr is redefined in the end code.
\newenvironment{bibitemlist}[1]{%
   \list{\@biblabel{\@arabic\c@enumiv}}%
       {\settowidth\labelwidth{\@biblabel{#1}}%
        \leftmargin\labelwidth
        \advance\leftmargin\labelsep
        \@openbib@code
        \usecounter{enumiv}%
        \let\p@enumiv\@empty
        \renewcommand\theenumiv{\@arabic\c@enumiv}%
	}%
  \sloppy
  \clubpenalty4000
  \@clubpenalty \clubpenalty
  \widowpenalty4000%
  \sfcode`\.\@m}%
  {\def\@noitemerr
    {\@latex@warning{Empty `bibitemlist' environment}}%
    \endlist}

% Table of contents: an unnumbered section titled \contentsname followed by
% the contents of the .toc file.
\def\tableofcontents{\section*{\contentsname}\@starttoc{toc}}
% No extra space between paragraphs; 1em first-line indent.
\parskip0pt
\parindent1em
% Repeats the \Panel definition given earlier in this preamble with the same
% parameters and body.
\def\Panel#1#2#3#4{\multicolumn{#3}{){\columncolor{#2}}#4}{#1}}
% reflist: ragged-right list with an empty default label.  Labels given to
% \item are set in italic shape.  All vertical list spacing is zero except
% \parsep (2pt); the right margin is 0.25in.
\newenvironment{reflist}{%
  \begin{raggedright}%
  \begin{list}{}{%
    \setlength{\rightmargin}{0.25in}%
    \setlength{\itemindent}{0pt}%
    \setlength{\topsep}{0pt}%
    \setlength{\itemsep}{0pt}%
    \setlength{\parsep}{2pt}%
    \setlength{\parskip}{0pt}%
    \def\makelabel##1{\itshape ##1}%
  }%
}{%
  \end{list}%
  \end{raggedright}%
}
% sansreflist: same layout as reflist (ragged-right list, empty default
% label, zero vertical spacing except a 2pt \parsep, 0.25in right margin),
% but labels given to \item are set in upright shape.
\newenvironment{sansreflist}{%
  \begin{raggedright}%
  \begin{list}{}{%
    \setlength{\rightmargin}{0.25in}%
    \setlength{\itemindent}{0pt}%
    \setlength{\topsep}{0pt}%
    \setlength{\itemsep}{0pt}%
    \setlength{\parsep}{2pt}%
    \setlength{\parskip}{0pt}%
    \def\makelabel##1{\upshape ##1}%
  }%
}{%
  \end{list}%
  \end{raggedright}%
}
% specHead{<label>}{<title>}: heading block for a specification entry.
% It draws a horizontal rule with space above and below, creates a link
% target labelled <label>, sets the right mark to <title>, adds a level-2
% PDF bookmark, and prints <title> in bold 16pt shifted 0.75in to the left.
% The blank line inside the begin code is significant: it ends the paragraph
% before the bookmark and title are set.  The end code is empty.
\newenvironment{specHead}[2]%
 {\vspace{20pt}\hrule\vspace{10pt}%
  \phantomsection\label{#1}\markright{#2}%

  \pdfbookmark[2]{#2}{#1}%
  \hspace{-0.75in}{\bfseries\fontsize{16pt}{18pt}\selectfont#2}%
  }{}
      \def\TheFullDate{2011-12-07 (revised: 07 December 2011)}
% Document identifier printed in the page footers (\fancyfoot[LO] and
% \fancyfoot[RE] below).  This document has no identifier, so the macro is
% empty.  The body used to be a stray \makeatother, which printed nothing
% either but was re-executed every time a footer was typeset.
\def\TheID{}
% Publication date in ISO form.
\def\TheDate{2011-12-07}
\title{Bayesian Spam Filtering Using Statistical Data Compression}
% The author is set in the document body; "@" stays a letter for the
% internal macros that follow.
\author{}\makeatletter 
\makeatletter
% \cleartoleftpage: issue \clearpage; then, in two-sided layouts only, add an
% empty page when the page counter is odd, so that what follows starts on an
% even-numbered page.  In two-column mode a second empty \newpage is issued.
\newcommand*{\cleartoleftpage}{%
  \clearpage
  \if@twoside
    \ifodd\c@page
      \null\newpage
      \if@twocolumn
        \null\newpage
      \fi
    \fi
  \fi
}
\makeatother
\makeatletter
% Page style "empty" for the current page; the running-head marks are set
% from the title and author.
\thispagestyle{empty}
\markright{\@title}\markboth{\@title}{\@author}
% \small: 9pt type on an 11pt baseline, with matching display-math skips and
% first-level list spacing (\@listi).
\renewcommand\small{\@setfontsize\small{9pt}{11pt}\abovedisplayskip 8.5\p@ plus3\p@ minus4\p@
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip \z@ plus2\p@
\belowdisplayshortskip 4\p@ plus2\p@ minus2\p@
\def\@listi{\leftmargin\leftmargini
               \topsep 2\p@ plus1\p@ minus1\p@
               \parsep 2\p@ plus\p@ minus\p@
               \itemsep 1pt}
}
\makeatother
% Verbatim blocks (fancyvrb): single frame, 5mm side margins, blank lines
% not numbered.
\fvset{frame=single,numberblanklines=false,xleftmargin=5mm,xrightmargin=5mm}
% Running heads and feet (fancyhdr).  All fields are cleared first.
% In the selectors, L/C/R is the position and E/O is even/odd pages.
\fancyhf{} 
\setlength{\headheight}{14pt}
% Heads: \leftmark on the left of even pages, \rightmark on the right of odd
% pages, both bold.
\fancyhead[LE]{\bfseries\leftmark} 
\fancyhead[RO]{\bfseries\rightmark} 
% Feet: page number centred; \TheID on the left of odd pages and on the right
% of even pages.
\fancyfoot[RO]{}
\fancyfoot[CO]{\thepage}
\fancyfoot[LO]{\TheID}
\fancyfoot[LE]{}
\fancyfoot[CE]{\thepage}
\fancyfoot[RE]{\TheID}
% Light grey link borders; section numbers are included in PDF bookmarks.
\hypersetup{citebordercolor=0.75 0.75 0.75,linkbordercolor=0.75 0.75 0.75,urlbordercolor=0.75 0.75 0.75,bookmarksnumbered=true}
% "plain" pages: no header text and no header rule.
\fancypagestyle{plain}{\fancyhead{}\renewcommand{\headrulewidth}{0pt}}

\date{}
\usepackage{authblk}

% \keywords{<terms>}: typeset an "Index terms---" line in \footnotesize.
% The size switch is kept inside a group and the paragraph is ended inside
% that group, so the smaller font and its line spacing apply to this line
% only.  Without the group, \footnotesize stayed in force for all text after
% the call.
\providecommand{\keywords}[1]
{%
  {\footnotesize
  \textbf{\textit{Index terms---}} #1\par}%
}

\usepackage{graphicx,xcolor}
\definecolor{GJBlue}{HTML}{273B81}
\definecolor{GJLightBlue}{HTML}{0A9DD9}
\definecolor{GJMediumGrey}{HTML}{6D6E70}
\definecolor{GJLightGrey}{HTML}{929497} 

% abstract: unindented, ragged-right block between two grey rules.
% Begin code: a 2pt GJMediumGrey rule across \textwidth, 16pt of space, then
% \abstractname in large bold GJBlue followed by a space.
% End code: 8pt of space, a second rule, then 16pt of space.
\renewenvironment{abstract}{%
   \setlength{\parindent}{0pt}\raggedright
   \textcolor{GJMediumGrey}{\rule{\textwidth}{2pt}}
   \vskip16pt
   \textcolor{GJBlue}{\large\bfseries\abstractname\space}
}{%   
   \vskip8pt
   \textcolor{GJMediumGrey}{\rule{\textwidth}{2pt}}
   \vskip16pt
}

\usepackage[absolute,overlay]{textpos}

\makeatother 
      \usepackage{lineno}
      \linenumbers
      
\begin{document}

             \author[1]{Dr. GUMPINA V V SATYA  PRASAD}

             \affil[1]{  Sir. C. R. REDDY COLLEGE OF ENGG, ELURU}

\renewcommand\Authands{ and }

\date{\small \em Received: 27 October 2011 Accepted: 22 November 2011 Published: 7 December 2011}

\maketitle


\begin{abstract}
        

Spam e-mail has become a major problem for companies and private users. This paper discusses the problems associated with spam and some different approaches attempting to deal with it. The most appealing methods are those that are easy to maintain and prove to have a satisfactory performance. Statistical classifiers are such a group of methods, as their ability to filter spam is based upon the previous knowledge gathered through collected and classified e-mails. A learning algorithm which uses the Naive Bayesian classifier has shown promising results in separating spam from legitimate mail.

\end{abstract}


\keywords{approaches, classified, Statistical}

\begin{textblock*}{18cm}(1cm,1cm) % {block width} (coords) 
\textcolor{GJBlue}{\LARGE Global Journals \LaTeX\ JournalKaleidoscope\texttrademark}
\end{textblock*}

\begin{textblock*}{18cm}(1.4cm,1.5cm) % {block width} (coords) 
\textcolor{GJBlue}{\footnotesize \\ Artificial Intelligence formulated this projection for compatibility purposes from the original article published at Global Journals. However, this technology is currently in beta. \emph{Therefore, kindly ignore odd layouts, missed formulae, text, tables, or figures.}}
\end{textblock*}


\let\tabcellsep& 	 	 		 
\section[{INTRODUCTION}]{INTRODUCTION}\par
Spam has become a serious problem because in the short term it is usually economically beneficial to the sender. The low cost of e-mail as a communication medium virtually guarantees profits. Even if a very small percentage of people respond to the spam advertising message by buying the product, this can be worth the money and the time spent for sending bulk e-mails. Commercial spammers are often represented by people or companies that have no reputation to lose. Because of technological obstacles with e-mail infrastructure, it is difficult and time-consuming to trace the individual or the group responsible for sending spam. Spammers make it even more difficult by hiding or forging the origin of their messages. Even if they are traced, the decentralized architecture of the Internet with no central authority makes it hard to take legal action against spammers. Statistical filtering (especially Bayesian filtering) has long been a popular anti-spam approach, but spam continues to be a serious problem to the Internet society. Recent spam attacks expose strong challenges to the statistical filters, which highlights the need for a new anti-spam approach. The economics of spam dictates that the spammer has to target several recipients with identical or similar e-mail messages. This makes collaborative spam filtering a natural defense paradigm, wherein a set of e-mail clients share their knowledge about recently received spam e-mails, providing a highly effective defense against a substantial fraction of spam attacks. Also, knowledge sharing can significantly alleviate the burden of frequently training stand-alone spam filters. However, any large-scale\footnote{Author: M.Tech, Asst. Professor, IT Dept, Sir. C. R. Reddy College of Engg, Eluru, A.P. India. E-mail: prasad17gumpina@gmail.com. Author: M.Tech, Asst. Professor, CSE Dept, Avanthi Institute of Engineering and Technology, Makavarapalem, Visakhapatnam. E-mail: balasriram1982@gmail.com.}
collaborative anti-spam approach is faced with a fundamental and important challenge, namely ensuring the privacy of the e-mails among untrusted e-mail entities. Different from the e-mail service providers such as Gmail or Yahoo mail, which utilize spam or ham (non-spam) classifications from all their users to classify new messages, privacy is a major concern for cross-enterprise collaboration, especially on a large scale. The idea of collaboration implies that the participating users and e-mail servers have to share and exchange information about the e-mails (including the classification result). However, e-mails are generally considered as private communication between the senders and the recipients, and they often contain personal and confidential information. Therefore, users and organizations are not comfortable sharing information about their e-mails until and unless they are assured that no one else (human or machine) would become aware of the actual contents of their e-mails. This genuine concern for privacy has deterred users and organizations from participating in any large-scale collaborative spam filtering effort. To protect e-mail privacy, a digest approach has been proposed in the collaborative anti-spam systems to both provide encryption for the e-mail messages and obtain useful information (fingerprint) from spam e-mail. Ideally, the digest calculation has to be a one-way function such that it should be computationally hard to generate the corresponding e-mail message. It should embody the textual features of the e-mail message such that if two e-mails have similar syntactic structure, then their fingerprints should also be similar. A few distributed spam identification schemes, such as Distributed Checksum Clearinghouse (DCC) \hyperref[b1]{[2]} and Vipul's Razor [3] have different ways to generate fingerprints. 
However, these systems are not sufficient to handle two security threats: 1) Privacy breach as discussed in detail in Section 2 and 2) Camouflage attacks, such as character replacement and good word appendant, make it hard to generate the same e-mail fingerprints for highly similar spam e-mails. 
\section[{STATISTICAL DATA COMPRESSION}]{STATISTICAL DATA COMPRESSION}\par
Probability plays a central role in data compression: Knowing the exact probability distribution governing an information source allows us to construct optimal or near-optimal codes for messages produced by the source. A statistical data compression algorithm exploits this relationship by building a statistical model of the information source, which can be used to estimate the probability of each possible message. This model is coupled with an encoder that uses these probability estimates to construct the final binary representation. For our purposes, the encoding problem is irrelevant. We therefore focus on the source modeling task.
\section[{PRELIMINARIES}]{PRELIMINARIES}\par
We denote by $X$ the random variable associated with the source, which may take the value of any message the source is capable of producing, and by $P$ the probability distribution over the values of $X$ with the corresponding probability mass function $p$. We are particularly interested in modeling of text generating sources. Each message $x$ produced by such a source is naturally represented as a sequence $x = x_1^n = x_1 \ldots x_n \in \Sigma^*$ of symbols over the source alphabet $\Sigma$. The length of a sequence can be arbitrary. For text generating sources, it is common to interpret a symbol as a single character, but other schemes are possible, such as binary (bitwise) or word-level models. The entropy $H(X)$ of a source $X$ gives a lower bound on the average per-symbol code length required to encode a message without loss of information:
\[
  H(X) = \mathrm{E}_{x \sim P}\left[ -\frac{1}{|x|} \log_2 p(x) \right].
\]
This bound is achievable only when the true probability distribution $P$ governing the source is known. In this case, an average message could be encoded using no less than $H(X)$ bits per symbol. However, the true distribution over all possible messages is typically unknown. The goal of any statistical data compression algorithm is then to infer a probability mass function $f$ over sequences which matches the true distribution of the source as accurately as possible.\par
Ideally, a sequence $x$ is then encoded with $L(x)$ bits, where $L(x) = -\log f(x)$ and $f$ is the probability mass function inferred by the compression model. The compression algorithm must therefore learn an approximation of $P$ in order to encode messages efficiently. A better approximation will, on average, lead to shorter code lengths. This simple observation alone gives compelling motivation for the use of compression algorithms in text categorization.\par
% Stray heading numeral "IV." from the extraction; \section numbers the heading itself.
\section[{BAYESIAN SPAM FILTERING}]{BAYESIAN SPAM FILTERING}\par
Bayesian spam filtering can be conceptualized into the model presented in Figure~1. It consists of four major modules, each responsible for one of four different processes: message tokenization, probability estimation, feature selection and Naive Bayesian classification.\par
When a message arrives, it is first tokenized into a set of features (tokens), $F$. Every feature is assigned an estimated probability that indicates its spaminess. To reduce the dimensionality of the feature vector, a feature selection algorithm is applied to output a subset of the features, $F_1$. The Naive Bayesian classifier combines the probabilities of every feature in $F_1$, and estimates the probability of the message being spam. In the following text, the process of Naive Bayesian classification is described, followed by details concerning the measuring of performance. This order of explanation is necessary because the sections concerned with the first three modules require understanding of the classification process and the parameters used to evaluate its improvement.\par
% Stray heading numeral "V." from the extraction; \section numbers the heading itself.
\section[{PERFORMANCE EVOLUTION}]{PERFORMANCE EVOLUTION}\par
Precision and recall. A well-employed metric for performance measurement in information retrieval is precision and recall. These measures have been diligently used in the context of spam classification (Sahami et al.\ 1998). Recall is the proportion of relevant items that are retrieved, which in this case is the proportion of spam messages that are actually recognized. For example, if 9 out of 10 spam messages are correctly identified as spam, the recall rate is 0.9. Precision is defined as the proportion of items retrieved that are relevant. In the spam classification context, precision is the proportion of the spam messages classified as spam over the total number of messages classified as spam. Thus if only spam messages are classified as spam then the precision is 1. As soon as a good legitimate message is classified as spam, the precision will drop below 1. Formally: Let $n_{gg}$ be the number of good messages classified as good (also known as true negatives). Let $n_{gs}$ be the number of good messages classified as spam (also known as false positives). Let $n_{ss}$ be the number of spam messages classified as spam (also known as true positives). Let $n_{sg}$ be the number of spam messages classified as good (also known as false negatives). Then the precision is $p = n_{ss}/(n_{ss}+n_{gs})$ and the recall is $r = n_{ss}/(n_{ss}+n_{sg})$. The precision calculates the occurrence of false positives, which are good messages classified as spam. When this happens $p$ drops below 1. Such misclassification could be a disaster for the user, whereas the only impact of a low recall rate is to receive spam messages in the inbox. Hence it is more important for the precision to be at a high level than the recall rate. The precision and recall reveal little unless used together. Commercial spam filters sometimes claim that they have an incredibly high precision value of 99.99\% without mentioning the related recall rate. This can appear to be very good to the untrained eye. A reasonably good spam classifier should have precision very close to 1 and a recall rate $> 0.8$. 
A problem when evaluating classifiers is to find a good balance between the precision and recall rates. Therefore it is necessary to use a strategy to obtain a combined score. One way to achieve this is to use weighted accuracy.\par
% Stray heading numeral "VI." from the extraction; \section numbers the heading itself.
\section[{CROSS VALIDATION}]{CROSS VALIDATION}\par
There are several means of estimating how well the classifier works after training. The easiest and most straightforward means is by splitting the corpus into two parts and using one part for training and the other for testing. This is called the holdout method. The disadvantage is that the evaluation depends heavily on which samples end up in which set. Another method that reduces the variance of the holdout method is $k$-fold cross-validation. In $k$-fold cross-validation (Kohavi 1995) the corpus, $M$, is split into $k$ mutually exclusive parts, $M_1, M_2, \ldots, M_k$. The inducer is trained on $M \setminus M_i$ and tested against $M_i$. This is repeated $k$ times with different $i$ such that $i \in \{1,2,\ldots,k\}$. Finally the performance is estimated as the mean over all $k$ tests. 
\section[{CONCLUSION}]{CONCLUSION}\par
An optimal search algorithm called SFFS was applied to find a subset of delimiters for the tokenizer. Then a filter and a wrapper algorithm were proposed to determine how beneficial a group of delimiters is to the classification task. The filter approach ran about ten times faster than the wrapper, but did not produce significantly better subsets than the baselines. The wrapper did improve the performance on all corpora by finding small subsets of delimiters. This suggested an idea concerning how to select delimiters for a near-optimal solution, namely to start with space and then add a few more. Since the wrapper-generated subsets had nothing in common apart from space, the recommendation is to only use space as a delimiter. The wrapper was far too slow to use in a spam filter. \footnote{© 2011 Global Journals Inc. (US)} \footnote{( I ) 2011 December} \footnote{© 2011 Global Journals Inc. (US)} \footnote{Global I ) 2011 December (} \footnote{Global I ) 2011 December ( Bayesian Spam Filtering Using Statistical Data Compression} \backmatter \begin{bibitemlist}{1}
\bibitem[Androutsopoulos et al. ()]{b2}\label{b2} 	 		\textit{},  		 			I Androutsopoulos 		,  		 			J Koutsias 		,  		 			K V Chandrinos 		,  		 			George Paliouras 		,  		 			C D Spyropoulos 		.  		2000b.  	 
\bibitem[11th European Conference on Machine Learning ()]{b4}\label{b4} 	 		\textit{11th European Conference on Machine Learning},  				 (Barcelona, Spain)  		2000. p. .  	 
\bibitem[Potamias Moustakis Van Someren]{b3}\label{b3} 	 		‘An Evaluation of Naive Bayesian Anti-Spam Filtering’.  	 	 		\textit{Proceedings of the Workshop on Machine Learning in the New},  				 			G Potamias,  			V Moustakis,  			M Van Someren 		 (ed.)  		 (the Workshop on Machine Learning in the New)  		 	 
\bibitem[Bratko and Filipi]{b11}\label{b11} 	 		 			A Bratko 		,  		 			B Filipi 		.  		\textit{Spam filtering using character-level markov models: Experiments for the TREC 2005 Spam Track},  				 	 
\bibitem[Assis et al. (2005)]{b8}\label{b8} 	 		‘CRM114 versus Mr. X: CRM114 notes for the TREC 2005 spam track’.  		 			F Assis 		,  		 			W Yerazunis 		,  		 			C Siefkes 		,  		 			S Chhabra 		.  	 	 		\textit{Proc. 14th Text REtrieval Conference (TREC 2005)},  				 (14th Text REtrieval Conference (TREC 2005)Gaithersburg, MD)  		November 2005.  	 
\bibitem[Benedetto et al. (2002)]{b10}\label{b10} ‘Language trees and zipping’. D Benedetto, E Caglioti, V Loreto. \textit{Physical Review Letters} 2002. 88 (4).
\bibitem[Androutsopoulos et al. ()]{b1}\label{b1} 	 		‘Learning to filter spam email: A comparison of a naive bayesian and a memory-based approach’.  		 			I Androutsopoulos 		,  		 			G Paliouras 		,  		 			V Karkaletsis 		,  		 			G Sakkis 		,  		 			C Spyropoulos 		,  		 			P Stamatopoulos 		.  	 	 		\textit{Workshop on Machine Learning and Textual Information Access, 4th European 3. Conference on Principles and Practice of Knowledge Discovery in Databases},  				2000a. 2000.  	 
\bibitem[Androutsopoulos et al. ()]{b5}\label{b5} 	 		\textit{Learning to Filter Unsolicited Commercial E-Mail},  		 			I Androutsopoulos 		,  		 			G Paliouras 		,  		 			E Michelakis 		.  		2004. 2003.  		 			Athens University of Economics and Business and National Centre for Scientific Research "Demokritos" Bevilacqua-Linn M. 		 	 
\bibitem[Androutsopoulos et al. (2004)]{b7}\label{b7} 	 		‘Learning to filter unsolicited commercial e-mail’.  		 			G Androutsopoulos 		,  		 			E Paliouras 		,  		 			Michelakis 		.  	 	 		\textit{NCSR "Demokritos},  				2004. October 2004.  	 	 (Technical Report) 
\bibitem[Almuallim and Dietterich ()]{b0}\label{b0} 	 		‘Learning with many irrelevant features’.  		 			H Almuallim 		,  		 			T Dietterich 		.  	 	 		\textit{Proceedings of the Ninth National Conference on Artificial Intelligence},  				 (the Ninth National Conference on Artificial IntelligenceMenlo Park, CA)  		1991. AAAI Press/The MIT Press. p. .  	 
\bibitem[Spector ()]{b6}\label{b6} 	 		‘Submodel selection and evaluation in regression: The Xrandom case’.  		 			L Spector 		,  		 			P 		.  	 	 		\textit{Machine Learning for Naive Bayesian Spam Filter Tokenization Breiman},  				1992. 60 p. .  	 
\bibitem[Barron et al. ()]{b9}\label{b9} 	 		‘The minimum description length principle in coding and modeling’.  		 			A R Barron 		,  		 			J Rissanen 		,  		 			B Yu 		.  	 	 		\textit{IEEE Transactions on Information Theory}  		D. Benedetto, E. Caglioti (ed.)  		1998. 44  (6)  p. 2011.  	 
\end{bibitemlist}
 			 		 	 
\end{document}