\documentclass[11pt,twoside]{article}\makeatletter

\IfFileExists{xcolor.sty}%
  {\RequirePackage{xcolor}}%
  {\RequirePackage{color}}
\usepackage{colortbl}
\usepackage{wrapfig}
\usepackage{ifxetex}
\ifxetex
  % XeTeX engine: native Unicode input, fonts selected through fontspec.
  \usepackage{fontspec}
  \usepackage{xunicode}
  % U+20E5, U+2774 and U+2775 are made active and print a literal
  % backslash and literal braces.  The same code points (8421, 10100,
  % 10101 decimal) are declared for the ucs package in the other branch.
  \catcode`⃥=\active \def⃥{\textbackslash}
  \catcode`❴=\active \def❴{\{}
  \catcode`❵=\active \def❵{\}}
  % Script switches: each one selects a Noto Sans CJK family.
  \def\textJapanese{\fontspec{Noto Sans CJK JP}}
  \def\textChinese{\fontspec{Noto Sans CJK SC}}
  \def\textKorean{\fontspec{Noto Sans CJK KR}}
  \setmonofont{DejaVu Sans Mono}
  
\else
  % Other engines: 8-bit input decoding, preferring utf8x when its
  % definition file is installed and falling back to plain utf8.
  \IfFileExists{utf8x.def}%
   {\usepackage[utf8x]{inputenc}
      \PrerenderUnicode{–}
    }%
   {\usepackage[utf8]{inputenc}}
  \usepackage[english]{babel}
  \usepackage[T1]{fontenc}
  \usepackage{float}
  \usepackage[]{ucs}
  % Replacement text for individual Unicode code points (decimal).
  \uc@dclc{8421}{default}{\textbackslash }
  \uc@dclc{10100}{default}{\{}
  \uc@dclc{10101}{default}{\}}
  \uc@dclc{8491}{default}{\AA{}}
  \uc@dclc{8239}{default}{\,}
  \uc@dclc{20154}{default}{ }
  \uc@dclc{10148}{default}{>}
  % Schwa is imitated by an "e" rotated by -90 degrees.
  \def\textschwa{\rotatebox{-90}{e}}
  % The script switches are no-ops here.  \textKorean is included so that
  % all three switches defined in the XeTeX branch exist in this branch
  % as well and a document using it compiles under either engine.
  \def\textJapanese{}
  \def\textChinese{}
  \def\textKorean{}
  \IfFileExists{tipa.sty}{\usepackage{tipa}}{}
\fi
% Font switch for examples: typewriter family at \small size.
\def\exampleFont{\ttfamily\small}
% \textpi is taken from slot 25 of the OML encoding.
\DeclareTextSymbol{\textpi}{OML}{25}
\usepackage{relsize}
\RequirePackage{array}
% \@testpach, redefined after the array package has been loaded.
% It classifies the current column-specification token by assigning
% \@chclass and \@chnum, based on \@lastchclass and \@nextchar:
%   - \@lastchclass 6, 7, 8, 9 -> \@chclass 1 (with \@chnum 1), 5, 2, 3;
%   - otherwise \@chclass starts at 0 and, unless \@lastchclass is 10,
%     \@nextchar is stringified and compared:
%       c, l, r      -> \@chnum 0, 1, 2 (class stays 0)
%       |            -> class 1
%       !            -> class 6
%       @            -> class 7
%       (            -> class 8
%       )            -> class 9
%       m, p, b      -> class 10 with \@chnum 3, 4, 5
%       anything else -> \@chnum 0, class 0 and \@preamerr \z@.
% The characters ( and ) tested here are the ones that \Panel and the
% \newcolumntype declarations in this file put in front of their
% declaration groups, e.g. "){\raggedright\arraybackslash}p{#1}".
% NOTE(review): the line ends inside the body are significant (they
% terminate numbers); do not append comments to those lines.
\def\@testpach{\@chclass
 \ifnum \@lastchclass=6 \@ne \@chnum \@ne \else
  \ifnum \@lastchclass=7 5 \else
   \ifnum \@lastchclass=8 \tw@ \else
    \ifnum \@lastchclass=9 \thr@@
   \else \z@
   \ifnum \@lastchclass = 10 \else
   \edef\@nextchar{\expandafter\string\@nextchar}%
   \@chnum
   \if \@nextchar c\z@ \else
    \if \@nextchar l\@ne \else
     \if \@nextchar r\tw@ \else
   \z@ \@chclass
   \if\@nextchar |\@ne \else
    \if \@nextchar !6 \else
     \if \@nextchar @7 \else
      \if \@nextchar (8 \else
       \if \@nextchar )9 \else
  10
  \@chnum
  \if \@nextchar m\thr@@\else
   \if \@nextchar p4 \else
    \if \@nextchar b5 \else
   \z@ \@chclass \z@ \@preamerr \z@ \fi \fi \fi \fi
   \fi \fi  \fi  \fi  \fi  \fi  \fi \fi \fi \fi \fi \fi}
% \arraybackslash: (globally defined) makes \\ mean \@arraycr again, for
% use after declarations such as \raggedright inside a column.
\gdef\arraybackslash{\let\\=\@arraycr}
% \@textsubscript{<text>}: sets <text> as a subscript, in an \mbox at
% font size \sf@size, inside \ensuremath.
\def\@textsubscript#1{{\m@th\ensuremath{_{\mbox{\fontsize\sf@size\z@#1}}}}}
% \Panel{<content>}{<colour>}{<columns spanned>}{<column spec>}:
% a \multicolumn cell whose column spec is prefixed with a \columncolor
% declaration.  The ")" before the declaration group is the character
% that the \@testpach redefinition in this file maps to class 9.
\def\Panel#1#2#3#4{\multicolumn{#3}{){\columncolor{#2}}#4}{#1}}
% Macros that expand to nothing, so that any text following them in the
% input is typeset unchanged.
% NOTE(review): the names look like TEI element names -- confirm against
% the generating stylesheet.
\def\abbr{}
\def\corr{}
\def\expan{}
\def\gap{}
\def\orig{}
\def\reg{}
% NOTE(review): this replaces LaTeX's cross-reference command \ref with
% an empty macro; the document body relies on it in constructs such as
% "{\ref 2}".  Packages loaded later may define \ref again -- verify.
\def\ref{}
\def\sic{}
\def\persName{}\def\name{}
\def\placeName{}
\def\orgName{}
% One-argument text styling helpers.
% \textcal and \textgothic switch fonts with \fontspec, which this file
% loads only in the XeTeX branch -- presumably they are meant for XeTeX
% runs only; verify before using them with another engine.
\def\textcal#1{{\fontspec{Lucida Calligraphy}#1}}
\def\textgothic#1{{\fontspec{Lucida Blackletter}#1}}
% Size changes, kept local by the inner group.
\def\textlarge#1{{\large #1}}
% Overline, set through math mode.
\def\textoverbar#1{\ensuremath{\overline{#1}}}
% Wraps the argument in single curly quotation marks.
\def\textquoted#1{‘#1’}
\def\textsmall#1{{\small #1}}
% Public subscript command, implemented with \@textsubscript.
\def\textsubscript#1{\@textsubscript{\selectfont#1}}
% Greek letter xi, via math mode.
\def\textxi{\ensuremath{\xi}}
% Font declaration for titles: italic shape.
\def\titlem{\itshape}
% Environments for bibliographic, title-page and manuscript-description
% material.  Several end with "\ifvmode\par\fi", i.e. they issue \par
% only when TeX is in vertical mode at the end of the environment.
\newenvironment{biblfree}{}{\ifvmode\par\fi }
\newenvironment{bibl}{}{}
% byline: 6pt skip, then italic text at 16pt on an 18pt baseline.
\newenvironment{byline}{\vskip6pt\itshape\fontsize{16pt}{18pt}\selectfont}{\par }
\newenvironment{citbibl}{}{\ifvmode\par\fi }
% docAuthor: italic; the 4pt skip and the 16pt/18pt size are applied
% only when the environment starts in vertical mode.
\newenvironment{docAuthor}{\ifvmode\vskip4pt\fontsize{16pt}{18pt}\selectfont\fi\itshape}{\ifvmode\par\fi }
\newenvironment{docDate}{}{\ifvmode\par\fi }
\newenvironment{docImprint}{\vskip 6pt}{\ifvmode\par\fi }
% docTitle: 6pt skip, then bold text at 22pt on a 25pt baseline.
\newenvironment{docTitle}{\vskip6pt\bfseries\fontsize{22pt}{25pt}\selectfont}{\par }
\newenvironment{msHead}{\vskip 6pt}{\par}
\newenvironment{msItem}{\vskip 6pt}{\par}
\newenvironment{rubric}{}{}
\newenvironment{titlePart}{}{\par }

% Column types taking one argument, the column width.
% L, C, R: p-columns set ragged-right, centred and ragged-left.
% P, B, M: p-, b- and m-columns with \arraybackslash only.
% The ")" in front of each declaration group is the character that the
% \@testpach redefinition in this file maps to class 9.
\newcolumntype{L}[1]{){\raggedright\arraybackslash}p{#1}}
\newcolumntype{C}[1]{){\centering\arraybackslash}p{#1}}
\newcolumntype{R}[1]{){\raggedleft\arraybackslash}p{#1}}
\newcolumntype{P}[1]{){\arraybackslash}p{#1}}
\newcolumntype{B}[1]{){\arraybackslash}b{#1}}
\newcolumntype{M}[1]{){\arraybackslash}m{#1}}
% Colour "label": 75% gray.
\definecolor{label}{gray}{0.75}
% \unusedattribute{<text>}: <text> in the "label" colour, struck out with
% \sout (the ulem package is loaded further down in this preamble).
\def\unusedattribute#1{\sout{\textcolor{label}{#1}}}
% \xref{<url>}{<text>}: \hyper@normalise processes the arguments before
% \xref@ passes them, in swapped order, to \hyper@linkurl{<text>}{<url>}.
% Both \hyper@... macros are only needed when \xref is used, so defining
% it before hyperref is loaded is fine.
\DeclareRobustCommand*{\xref}{\hyper@normalise\xref@}
\def\xref@#1#2{\hyper@linkurl{#2}{#1}}
% Underscore handling.
% Inside a group "_" is made active and globally defined as a
% one-argument macro that sets its argument as an upright (\mathrm)
% subscript through \ensuremath.
\begingroup
\catcode`\_=\active
\gdef_#1{\ensuremath{\sb{\mathrm{#1}}}}
\endgroup
% After the group: "_" gets math code "8000 (so that it is treated as an
% active character in math mode) and category code 12, i.e. it is an
% ordinary printable character in text.
\mathcode`\_=\string"8000
\catcode`\_=12\relax

\usepackage[a4paper,twoside,lmargin=1in,rmargin=1in,tmargin=1in,bmargin=1in,marginparwidth=0.75in]{geometry}
\usepackage{framed}

\definecolor{shadecolor}{gray}{0.95}
\usepackage{longtable}
\usepackage[normalem]{ulem}
\usepackage{fancyvrb}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage{marginnote}

% \@cite{<labels>}{<note>}: prints the citation labels without the
% surrounding square brackets of the default definition.
% The citation machinery calls \@cite with two arguments (the label list
% and the optional note of \cite[<note>]{...}).  Both are consumed here;
% with a one-parameter definition the note would stay in the input
% stream and be typeset directly after the label.  The note is printed,
% comma-separated, only when \cite was called with an optional argument
% (signalled by \if@tempswa).
\renewcommand{\@cite}[2]{#1\if@tempswa , #2\fi}


% Font for margin notes: italic, footnote size.
\renewcommand*{\marginfont}{\itshape\footnotesize}

% File extensions accepted for graphics files.
\def\Gin@extensions{.pdf,.png,.jpg,.mps,.tif}

  % Default page style: "fancy" (configured with \fancyhead/\fancyfoot
  % further down in this preamble).
  \pagestyle{fancy}

\usepackage[pdftitle={Annotated Bangla News Corpus and Lexicon Development with POS Tagging and Stemming},
 pdfauthor={}]{hyperref}
\hyperbaseurl{}

	 % Explicit paper dimensions: 210mm x 297mm.
	 \paperwidth210mm
	 \paperheight297mm
              
% --- Table of contents -------------------------------------------------
% Dot separation, right margin and page-number box width for contents
% entries (\@pnumwidth is also used by \l@section in this file); entries
% are listed down to level 3.
\def\@dotsep{4.5}
\def\@tocrmarg{2.55em}
\def\@pnumwidth{1.55em}
\setcounter{tocdepth}{3}
% --- Line breaking -----------------------------------------------------
\pretolerance=750
\tolerance=2000
\emergencystretch=3em
\hyphenpenalty=400
% --- Badness reporting thresholds ---------------------------------------
\hbadness=4000
\vbadness=4000
% --- Page breaking -----------------------------------------------------
\clubpenalty=8000
\widowpenalty=10000

% Sectioning commands, rebuilt on \@startsection.  The arguments are:
% {name}{level}{indent}{space before}{space after}{heading style}.
% Levels 1-4 start at the left margin (\z@) and use positive after-skips
% of 0.5ex; level 5 is indented by \parindent and uses a negative
% after-skip (-1em), which by \@startsection's convention makes it a
% run-in heading.
% Section: \Large bold.
\renewcommand\section{\@startsection {section}{1}{\z@}%
     {-1.75ex \@plus -0.5ex \@minus -.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\Large\bfseries}}
% Subsection: \Large, not bold.
\renewcommand\subsection{\@startsection{subsection}{2}{\z@}%
     {-1.75ex\@plus -0.5ex \@minus- .2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\Large}}
% Subsubsection: \large.
\renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}%
     {-1.5ex\@plus -0.35ex \@minus -.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\large}}
% Paragraph: \normalsize.
\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}%
     {-1ex \@plus-0.35ex \@minus -0.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\normalsize}}
% Subparagraph: \normalsize bold.
\renewcommand\subparagraph{\@startsection{subparagraph}{5}{\parindent}%
     {1.5ex \@plus1ex \@minus .2ex}%
     {-1em}%
     {\reset@font\normalsize\bfseries}}


% Formatting of table-of-contents entries.
% \l@section{<title>}{<page>}: adds \@secpenalty and 1.0em (plus 1pt) of
% vertical space, then sets the entry in bold inside a group, with the
% page number right-aligned in a box of width \@pnumwidth.
\def\l@section#1#2{\addpenalty{\@secpenalty} \addvspace{1.0em plus 1pt}
 \@tempdima 1.5em \begingroup
 \parindent \z@ \rightskip \@pnumwidth 
 \parfillskip -\@pnumwidth 
 \bfseries \leavevmode #1\hfil \hbox to\@pnumwidth{\hss #2}\par
 \endgroup}
% Lower levels use \@dottedtocline{<level>}{<indent>}{<number width>}.
\def\l@subsection{\@dottedtocline{2}{1.5em}{2.3em}}
\def\l@subsubsection{\@dottedtocline{3}{3.8em}{3.2em}}
\def\l@paragraph{\@dottedtocline{4}{7.0em}{4.1em}}
\def\l@subparagraph{\@dottedtocline{5}{10em}{5em}}
% Create the section and chapter counters only if they do not exist yet
% (\frontmatter, \mainmatter and \backmatter below use both).
\@ifundefined{c@section}{\newcounter{section}}{}
\@ifundefined{c@chapter}{\newcounter{chapter}}{}
% Boolean \if@mainmatter, initially true.
\newif\if@mainmatter 
\@mainmattertrue
\def\chaptername{Chapter}
% \frontmatter: roman page numbers; chapter and section numbers (and
% their \theH... counterparts) are printed in lowercase roman; the
% chapter prefix \@chapapp is empty.
\def\frontmatter{%
  \pagenumbering{roman}
  \def\thechapter{\@roman\c@chapter}
  \def\theHchapter{\roman{chapter}}
  \def\thesection{\@roman\c@section}
  \def\theHsection{\roman{section}}
  \def\@chapapp{}%
}
% \mainmatter: starts after \cleardoublepage, resets the chapter and
% section counters, switches page, chapter and section numbering to
% arabic, sets secnumdepth to 6 and uses \chaptername as chapter prefix.
\def\mainmatter{%
  \cleardoublepage
  \def\thechapter{\@arabic\c@chapter}
  \setcounter{chapter}{0}
  \setcounter{section}{0}
  \pagenumbering{arabic}
  \setcounter{secnumdepth}{6}
  \def\@chapapp{\chaptername}%
  \def\theHchapter{\arabic{chapter}}
  \def\thesection{\@arabic\c@section}
  \def\theHsection{\arabic{section}}
}
% \backmatter: starts after \cleardoublepage, resets the chapter and
% section counters, sets secnumdepth to 2, numbers chapters with capital
% letters under the prefix \appendixname, then calls \appendix.
\def\backmatter{%
  \cleardoublepage
  \setcounter{chapter}{0}
  \setcounter{section}{0}
  \setcounter{secnumdepth}{2}
  \def\@chapapp{\appendixname}%
  \def\thechapter{\@Alph\c@chapter}
  \def\theHchapter{\Alph{chapter}}
  \appendix
}
% bibitemlist{<widest label>}: a numbered bibliography list.
% Labels are \@biblabel applied to the arabic value of counter enumiv;
% the label width is measured from \@biblabel{<widest label>} and the
% left margin is that width plus \labelsep.  Within the list \sloppy is
% in effect, club and widow penalties are 4000, and the space factor
% code of "." is set to \@m.  An empty list produces a warning instead
% of the usual error (\@noitemerr is redefined in the end code).
\newenvironment{bibitemlist}[1]{%
   \list{\@biblabel{\@arabic\c@enumiv}}%
       {\settowidth\labelwidth{\@biblabel{#1}}%
        \leftmargin\labelwidth
        \advance\leftmargin\labelsep
        \@openbib@code
        \usecounter{enumiv}%
        \let\p@enumiv\@empty
        \renewcommand\theenumiv{\@arabic\c@enumiv}%
	}%
  \sloppy
  \clubpenalty4000
  \@clubpenalty \clubpenalty
  \widowpenalty4000%
  \sfcode`\.\@m}%
  {\def\@noitemerr
    {\@latex@warning{Empty `bibitemlist' environment}}%
    \endlist}

% Table of contents: an unnumbered section headed \contentsname followed
% by the contents of the .toc file.
\def\tableofcontents{\section*{\contentsname}\@starttoc{toc}}
% Paragraphs: no extra vertical space, 1em first-line indent.
\parskip0pt
\parindent1em
% Same definition of \Panel as the one given earlier in this preamble.
\def\Panel#1#2#3#4{\multicolumn{#3}{){\columncolor{#2}}#4}{#1}}
% reflist: a ragged-right list with an empty default label and italic
% item labels.  Right margin 0.25in, \parsep 2pt; \topsep, \itemsep,
% \itemindent and \parskip are all zero.
\newenvironment{reflist}%
  {\begin{raggedright}%
   \begin{list}{}{%
     \def\makelabel##1{\itshape ##1}%
     \setlength\rightmargin{0.25in}%
     \setlength\parsep{2pt}%
     \setlength\topsep{0pt}%
     \setlength\itemsep{0pt}%
     \setlength\itemindent{0pt}%
     \setlength\parskip{0pt}}%
  }%
  {\end{list}\end{raggedright}}
% sansreflist: a ragged-right list with an empty default label and
% upright item labels.  Right margin 0.25in, \parsep 2pt; \topsep,
% \itemsep, \itemindent and \parskip are all zero.
\newenvironment{sansreflist}%
  {\begin{raggedright}%
   \begin{list}{}{%
     \def\makelabel##1{\upshape ##1}%
     \setlength\rightmargin{0.25in}%
     \setlength\parsep{2pt}%
     \setlength\topsep{0pt}%
     \setlength\itemsep{0pt}%
     \setlength\itemindent{0pt}%
     \setlength\parskip{0pt}}%
  }%
  {\end{list}\end{raggedright}}
% specHead{<label>}{<heading>}: heading block for a specification entry.
% Begin code: 20pt space, a horizontal rule, 10pt space; a link anchor
% (\phantomsection) labelled <label>; <heading> becomes the right mark;
% a level-2 PDF bookmark named <label> with text <heading>; finally the
% heading itself, bold at 16pt on 18pt, shifted left by 0.75in.
% The blank line inside the begin code yields a \par.  No end code.
\newenvironment{specHead}[2]%
 {\vspace{20pt}\hrule\vspace{10pt}%
  \phantomsection\label{#1}\markright{#2}%

  \pdfbookmark[2]{#2}{#1}%
  \hspace{-0.75in}{\bfseries\fontsize{16pt}{18pt}\selectfont#2}%
  }{}
      % Document metadata used by the title block and the page footers.
      \def\TheFullDate{2017-01-15 (revised: 15 January 2017)}
% \TheID is printed in the page footers (see the \fancyfoot settings).
% There is no document identifier, so it is empty; it must not contain
% \makeatother, which would be executed every time a footer is typeset.
\def\TheID{}
\def\TheDate{2017-01-15}
\title{Annotated Bangla News Corpus and Lexicon Development with POS Tagging and Stemming}
\author{}
% Keep @ a letter for the internal macros used in the definitions below.
\makeatletter
% \cleartoleftpage: issue \clearpage; then, in two-sided layout and only
% if the page counter is odd, output an empty box followed by \newpage
% (and once more in two-column mode), so that the page counter is even
% when the following material starts.
\newcommand*{\cleartoleftpage}{%
  \clearpage
  \if@twoside \ifodd\c@page
    \hbox{}\newpage
    \if@twocolumn \hbox{}\newpage \fi
  \fi \fi
}
\makeatother
\makeatletter
% Initial page style and running-head marks: the left mark is the
% document title, the right mark the author (the \markboth call
% supersedes the \markright before it).
\thispagestyle{empty}
\markright{\@title}\markboth{\@title}{\@author}
% \small: 9pt type on an 11pt baseline, with matching display-math skips
% and first-level list spacing (\@listi).
\renewcommand\small{\@setfontsize\small{9pt}{11pt}\abovedisplayskip 8.5\p@ plus3\p@ minus4\p@
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip \z@ plus2\p@
\belowdisplayshortskip 4\p@ plus2\p@ minus2\p@
\def\@listi{\leftmargin\leftmargini
               \topsep 2\p@ plus1\p@ minus1\p@
               \parsep 2\p@ plus\p@ minus\p@
               \itemsep 1pt}
}
\makeatother
% fancyvrb defaults: framed verbatim blocks with 5mm side margins; blank
% lines are not numbered.
\fvset{frame=single,numberblanklines=false,xleftmargin=5mm,xrightmargin=5mm}
% Headers and footers: clear all fields, then set them per page parity
% (E = even, O = odd; L/C/R = left/centre/right).
\fancyhf{} 
\setlength{\headheight}{14pt}
% Header: left mark on even pages, right mark on odd pages, both bold.
\fancyhead[LE]{\bfseries\leftmark} 
\fancyhead[RO]{\bfseries\rightmark} 
% Footer: page number centred; \TheID on the left of odd pages and on
% the right of even pages; the remaining fields are empty.
\fancyfoot[RO]{}
\fancyfoot[CO]{\thepage}
\fancyfoot[LO]{\TheID}
\fancyfoot[LE]{}
\fancyfoot[CE]{\thepage}
\fancyfoot[RE]{\TheID}
% Light gray (0.75) borders for citation, internal and URL links;
% bookmarks carry section numbers.
\hypersetup{citebordercolor=0.75 0.75 0.75,linkbordercolor=0.75 0.75 0.75,urlbordercolor=0.75 0.75 0.75,bookmarksnumbered=true}
% "plain" page style: no header text and no header rule.
\fancypagestyle{plain}{\fancyhead{}\renewcommand{\headrulewidth}{0pt}}

\date{}
\usepackage{authblk}

% \keywords{<comma-separated terms>}: typesets an "Index terms" line in
% footnote size.
% \footnotesize is a declaration, so it is confined to a group here;
% otherwise it would stay in force for all text after \keywords.  The
% paragraph is ended inside the group so that the keywords line is also
% set with the footnote-size line spacing.
\providecommand{\keywords}[1]
{%
  {\footnotesize
   \textbf{\textit{Index terms---}} #1\par}%
}

\usepackage{graphicx,xcolor}
% Colour palette (HTML hex values).  GJBlue and GJMediumGrey are used by
% the abstract environment; GJBlue also by the banner text blocks.
\definecolor{GJBlue}{HTML}{273B81}
\definecolor{GJLightBlue}{HTML}{0A9DD9}
\definecolor{GJMediumGrey}{HTML}{6D6E70}
\definecolor{GJLightGrey}{HTML}{929497} 

% abstract: unindented, ragged-right text between two 2pt-thick rules of
% width \textwidth in GJMediumGrey.  The text is introduced by
% \abstractname in large bold GJBlue; there is 16pt of space after the
% top rule, 8pt before and 16pt after the bottom rule.
\renewenvironment{abstract}{%
   \setlength{\parindent}{0pt}\raggedright
   \textcolor{GJMediumGrey}{\rule{\textwidth}{2pt}}
   \vskip16pt
   \textcolor{GJBlue}{\large\bfseries\abstractname\space}
}{%   
   \vskip8pt
   \textcolor{GJMediumGrey}{\rule{\textwidth}{2pt}}
   \vskip16pt
}

\usepackage[absolute,overlay]{textpos}

\makeatother 
      \usepackage{lineno}
      \linenumbers
      
\begin{document}

             \author[1]{Abdul  Matin}

             \author[2]{Tasnim Haider  Chaudhury}

             \author[3]{M.S.  Hossain}

             \affil[1]{  }

\renewcommand\Authands{ and }

\date{\small \em Received: 15 December 2016 Accepted: 2 January 2017 Published: 15 January 2017}

\maketitle


\begin{abstract}
        

In this paper, we have developed a mono-linguistic Bengali news corpus using knowledge based AI (Artificial Intelligence) technique from some widely read Bengali newspapers which will be used as a reference corpus and will be very useful for lexicon development, morphological analysis, and automatic parts of speech detection. The corpus contains 74,698 word forms. The words in the lexicon are annotated with a combination of manual tags addressing Parts-of-Speech, Stemming, Morphemes, and other grammatical features are very important for almost all Natural Language Processing (NLP) applications. The lexicon contains around 14 thousand entries. In this paper we present some statistical analysis on some Bengali newspapers Prothom-Alo, Daily Janakantha, Daily Kalerkantho and Amardesh online from 1st January, 2012 to 31st January, 2012 those are the most popular Bengali newspapers in Bangladesh. We proposed a user friendly software interface to the user to annotate a large existing Bengali word set for the lexicon build up process.

\end{abstract}


\keywords{corpus, POS, tagging, stemming, lexicon.}

\begin{textblock*}{18cm}(1cm,1cm) % {block width} (coords) 
\textcolor{GJBlue}{\LARGE Global Journals \LaTeX\ JournalKaleidoscope\texttrademark}
\end{textblock*}

\begin{textblock*}{18cm}(1.4cm,1.5cm) % {block width} (coords) 
\textcolor{GJBlue}{\footnotesize \\ Artificial Intelligence formulated this projection for compatibility purposes from the original article published at Global Journals. However, this technology is currently in beta. \emph{Therefore, kindly ignore odd layouts, missed formulae, text, tables, or figures.}}
\end{textblock*}


\let\tabcellsep& 	 	 		 
\section[{Introduction}]{Introduction}\par
The significance of a large annotated corpus is a widely known fact. It is an important tool for researchers in Machine Translation (MT), Information Retrieval (IR), Speech Processing, Knowledge Based Computer Systems and Natural Language Processing (NLP). But for the Bengali language we do not have a large annotated corpus. The creation and distribution of corpora and other language resources, and their availability, is a must for enhancing language processing capabilities and research in this field \hyperref[b0]{[1]}. A corpus is also an essential language resource for creating an automatic dictionary from a huge collection of language text \hyperref[b1]{[2]}. It is the central repository of data for all language processing applications. Researchers are taking this field as a huge sector of research. In [3, 4, and 5] the authors focus on automatic Bangla corpus creation by combination of Bangla fonts. It contains information for human consumption as well as for computer programs. The book ``Corpus Linguistics and Language Technology'' \hyperref[b5]{[6]} is a warehouse for corpus related studies with special attention to Bangla, in which almost every linguistic feature of this language is discussed. In this paper, we are trying to present a reference corpus and a new approach to language investigation, to understand how a text corpus database is utilized to obtain new results on a language or its properties. A Bangla corpus can be used for several purposes including spell checkers and morphological analysis for the Bangla language.\par
A Bangla corpus can be extracted systematically from a Bangla corpus since it is considered a source of all words which is important for verification of Bangla sentence structure \hyperref[b6]{[7,}\hyperref[b7]{8]}. This paper proposes another process to manually build up a corpus, which is essentially a list of all words in the language and tag the words sufficiently with features such as word meaning, Parts-Of-Speech (POS) and all other grammatical features. All these information need to be stored in a database and properly formatted before display to end users. The aim of the project is to formalize a procedure for a collaborative effort by different individuals or groups towards producing a tagged Bangla corpus. This requires a POS tagging interface, both web based and standalone that would provide a common platform for different contributors to enter tag information, semantic and other grammatical information that is available in a dictionary. So we used this huge and mighty source as our source of our corpus data. 
\section[{II.}]{II.} 
\section[{Web as Corpus Source}]{Web as Corpus Source}\par
The use of the web as a corpus for teaching and research on language has been proposed a number of times. There has been a special issue of the Computational Linguistics journal on Web as Corpus. Several studies have used different methods to mine web data. Our attempt was to create an annotated Bangla text corpus which will contain Bangla text from most popular and well read newspapers of Bangladesh based on date (1 st January, 2012 to 31 st January, 2012) and several categories (Sports, Crime, Editorial, International, National, State, Sports, Business), so as to make it representative of every linguistic phenomena of Bangla. This project was based on huge data or text available in electronic format. As we are lacking good Bangla OCR applications for collecting Bangla text from printed book, journal and newspaper, so we had to restrict our attempt to collect corpus text from whatever resources we have available mainly from web. We have selected four newspapers that available in online and we used these in order to create a news corpus. These news corpuses contain 74698 word tokens and 13550 distinct words in this corpus. 
\section[{III.}]{III.} 
\section[{Data Collection a) Collection of the Raw Text}]{Data Collection a) Collection of the Raw Text}\par
Many newspapers in our country have their own online versions, but we chose four of them---Prothom-Alo, Daily Janakantha, Daily Kalerkantho and Amardesh online---mainly because they are widely read newspapers in Bangladesh and have fewer spelling mistakes. We consider the raw text of those newspapers available on the web and download all news from the web for collecting the corpus. The raw text for the corpus was collected from these newspapers by downloading all the news available for the year 2012 (from 1st January to 31st January) including magazines and periodicals, which were all in HTML format. The process took about one month to collect all these available data manually. At this point we ended up with news of thirty days, with each day having several text files that contained news of different genres. The corpus size is eighty megabytes. 
\section[{b) Conversion to UTF-8 Format}]{b) Conversion to UTF-8 Format}\par
Then we manually convert these entire text file to UTF-8 format to make these data available and correctly readable for our corpus creator program which only allow UTF-8 formatted text for processing. 
\section[{c) Classification of Collected Data}]{c) Classification of Collected Data}\par
For quick extraction of information of a UTF-8 formatted Bangla text file, we save these text files in some arranged folder format where name of these arranged folders will give information such as date of collection, source and source type, genre of data. Consider the following example like E:\textbackslash 08102011\textbackslash news.prothom-alocom\textbackslash crime\textbackslash file\textunderscore 001.txt that shown in Fig. 1. 
\section[{d) Database/System Design}]{d) Database/System Design}\par
We   
\section[{Text Pre-Processing}]{Text Pre-Processing}\par
In this step, remove following information from the chosen data as a pre-processing -A. First of all, we replace English letters and numeric alphabet both in Bangla and English from the UTF text data by a single space.\par
str=str.replaceAll("[[\textbackslash u0000-\textbackslash u007F][\textbackslash u09E6-\textbackslash u09EF]?""\&\&[\textasciicircum ?!?''-\textbackslash \textbackslash .]]"," ");\par
B. Then we replace all punctuation marks (except Purnacched, Colon, Question Mark, Exclamation mark, Apostrophe, Dash/Hyphen, Dot) by a single space. str=str.replaceAll("[\textbackslash \textbackslash p\{Punct\}\&\&[\textasciicircum \textbackslash \textbackslash .?!?'-]]"," "); C. We also replace all Dots by a single space except those which are preceded by an alphabetic character and followed by a space. 
\section[{Data Collection}]{Data Collection}\par
Newspaper's website (www.prothom-alo.com) D. We also replace all Dash/Hyphen by a single space except those which is preceded and followed by alphabet. str = str.replaceAll("-| -|-"," "); 
\section[{Source of}]{Source of}\par
????????-????,?? ??? ????????????????????? ?????? ? ????????? (Here Dash/Hyphen has not eliminated) ?????????????????????????????????????????? ?? ????????????????????????????????????? (Here Dash/Hyphen has eliminated) E. We also replace all Apostrophe/Inverse Comma/Quotation Mark by a single space except those which is preceded and followed by alphabet. str = str.replaceAll(" '| ' |' "," "); '?????????? ??????,??????????? ? ??????'(Here quotation mark has eliminated) ?????? '????????????? ???????????????????? ???????????????????????????? (Here Apostrophe has eliminated) F. We replace all sequences of single or more spaces by a single space. str = str.replaceAll("( )+", " "); G. Then output file will save as "D:\textbackslash PreProcessed\textunderscore File.txt" and this pre-processed text will be used for POS tagging, Stemming and Morphological Analysis. H. This pre-processed text file will be referred to as our Raw Corpus. This corpus will be useful to evaluate the performance of annotated text corpus and also will be used for input text of machine translation system.\par
We proposed a user-friendly software interface to the user to annotate a large existing Bangla word set for the lexicon build up process. The effort will be a significant progress towards development of a properly annotated lexicon. This user interface has two distinct parts -one for building corpus and show text information such as source, source type, category, date, title and news content. Another parts for issuing manual addressing Parts-of-Speech, Stemming, Morphemes, issue clitic, ambiguous condition and other grammatical features.\par
A supervised machine learning method has been used for lexicon development from the Bengali news corpus. No extensive knowledge about the language is required except the knowledge of the different inflections that can appear with the different words in Bengali. To make a proper annotation of each word form, we accompanied each word form with its POS tag, stem form, suffix, prefix, ambiguous condition (if it exists) and statistical count. Initially, all the words (inflected and uninflected) are extracted from the pre-processed text and added to a database with proper POS, stem, prefix, and suffix. The system retrieves the words from the preprocessed text and creates a database of distinct word forms with full annotation. The structure of the lexicon development process and some sample word forms are shown in Fig.~2. 
\section[{Fig.2: Structure of Lexicon development}]{Fig.2: Structure of Lexicon development}\par
Part-of-Speech (POS) tagging is the process of assigning each word of a text an appropriate part-of-speech tag. POS tags often signify the morphological \hyperref[b8]{[9]}, phonological and contextual properties of a word, and also provide information about neighboring words. In Bengali, there are five different POS, namely noun, pronoun, verb, adjective, and indeclinable (prepositions, conjunctions, and interjections). Noun, verb and adjective belong to the open class of POS in Bengali. In this lexicon analysis, we use seven parts-of-speech derived from the main five parts-of-speech in the Bangla language. As we know, a lot of proper nouns are used in the Bangla language. So we keep the proper noun distinct from other nouns to get more detail about the word forms which are in the range of noun. To handle clitics, which are one of the most common ambiguous situations in Natural Language Processing (NLP), we define a new POS form named clitic. If we add a word form to our lexicon database without setting any POS for that word form, our corpus creator software automatically sets its POS as UNKNOWN. Noun and verb words are tagged by looking at their inflections. Some inflections may be common to several word forms. In these cases, more than one POS may be generated for a few word forms. But here we set the mechanism for only one POS per word form.\par
We only suggest a procedure to handle this ambiguity at the initial level, where POS ambiguity is resolved by checking the number of occurrences of these possible root words along with the POS tags as derived from the same word forms. Pronoun and indeclinable are basically closed classes of POS \hyperref[b9]{[10]} in Bengali and these are added to the lexicon manually. It has been observed that adjectives in Bengali generally occur in four different forms based on the suffixes attached. For simplicity of counting or detecting sentences, we propose a user-defined POS named EOL (end of line). Here we detect the POS for Purnacched, Question Mark, and Exclamation mark as EOL. A short description of the POS categories is given in Table~I.\par
Stemming is an operation that splits a word into the constituent root part and affix without doing complete morphological analysis. It is used to improve the performance of spelling checkers and information retrieval applications, where morphological analysis would be too computationally expensive. Terms with common stems tend to have similar meaning. So it can drastically reduce the dictionary sized used in various NLP applications, especially for highly inflected languages. We handle this stemming process manually like previous POS tagging process. After tokenizing preprocessed text into individual word form we manually set root of that word form by removing prefix and suffix of it. Then this stem form is stored in STEM field of lexicon database.\par
In our process, we first stripped off the suffix part from Bengali words depending upon the type of suffixes. Then we checked for the validity of the suffix stripped word as root word, using a Bengali dictionary. If it is not sufficient we strip the affix part of the remaining part of the word form. It can bring a set of word with same root form in a series to learn about them easily. We can get almost similar word by retrieving word which has same root/stem form.\par
The smallest meaningful linguistic unit consists of a word or a word element that cannot be divided into smaller meaningful parts. At the time we set the stem word for a word form, we also store the stripped part of the word form as morphemes. First of all we split off the suffix part of a word and store it in our lexicon database. Then we check the remaining part of our word form to see whether there is any prefix part in that word. If one exists, we strip it from the remaining word part and store it in the database fully manually.\par
A corpus, from a linguistic point of view, is defined as a collection of transcribed speech or written text compiled mainly to enhance linguistic research. The key resource for any linguistic research is a trained, annotated corpus which can elevate language processing capabilities such as automatic part-of-speech tagging, machine translation, question answering, stemming etc.\par
We design and develop a view of annotated corpus which is mainly based on knowledge based representation (Knowledge based AI technique). Here we used our Lexicon as knowledge reference for our corpus. Our lexicon is the collection of word forms with fully annotated where each word form is accomplished with parts-of-speech, stem, Morphemes (suffix, prefix) and statistical counting. When we add a word form to corpus, we bring all the morphological and grammatical information from lexicon and add this information with that word form. Corpus procedure and flow are shown in Fig. 3  First of all we take pre-processed text as an input of our corpus creation. 2. Then tokenize this text into word forms. Then all these word forms is stored in an iterative list. 3. This iterative list is looped and gets each word forms as a sequence they were in pre-processed text. Then for each word, POS, Stem is brought from lexicon database and adds this information following the word form separating with for slash (/). Then we make a small change in lexicon, we increase the count value of lexicon by 1 of a word from each time we find this word form. This helps us to find the number of occurrence of a word form. 4. For defining end of a sentence, we use EOL as word form and EOL as POS.\par
? If sentence is an Assertive sentence, we use as stem. Example: ???????/UNK/UNK 6. Before adding tokenized word form of preprocessed text to corpus, we add the entire information associate with this new text with some predefine TAG to raw corpus. Tag format of our news corpus has given in Table  {\ref II}. 
\section[{a) Statistical Analysis}]{a) Statistical Analysis}\par
Regardless of the size of the corpus, it may be subjected to both qualitative as well as quantitative analysis using various methods of statistics. Both these types of corpus analysis have different perspectives. Quantitative analysis focuses on classifying different linguistic properties, whereas qualitative analysis aims to give a complete and detailed description of the observed phenomena. We wish to focus on some simple quantitative analysis using the U-Gram model.\par
We developed our corpus development program in such an efficient way that researchers can easily get a lot of common and most focused statistical output without any further processing. There is also a user-defined output generator where the user can get output according to his or her requirements.\par
Here we divide our statistical output generator procedure in two distinct parts:\par
\textbullet{} One for automated query based information.\par
\textbullet{} Another for user-defined query based information.\par
As result of automated query base perspective statistical output, we provide twelve statistical counting results. This type of statistical counting will be very helpful for linguistic analysis, machine translation, Morphological analysis, spelling variations, morphological structure, and word sense analysis. These statistical counting are,\par
\textbullet{} Number of sources from which this corpus data was collected, and their list.\par
\textbullet{} Number of source types of these sources of data, and their list.\par
Table  {\ref II}  Corpus is considered as basic resource for language analysis and research for many foreign languages. This reflects both ideological and technological change in the area of language research. The effort will be a significant progress towards development of a properly annotated lexicon. The outcome of the research will significantly be helpful for future analyzer in the processes of Morphological Analysis, Automatic grammar Extraction and Machine Translation for Bangla.  
\section[{Global}]{Global}  \begin{figure}[htbp]
\noindent\textbf{} \par 
\begin{longtable}{P{0.3001919385796545\textwidth}P{0.12970249520153548\textwidth}P{0.006525911708253358\textwidth}P{0.07178502879078695\textwidth}P{0.0008157389635316698\textwidth}P{0.25858925143953937\textwidth}P{0.04486564299424184\textwidth}P{0.03752399232245681\textwidth}}
Structure:\tabcellsep \tabcellsep \tabcellsep \tabcellsep \tabcellsep \\
Example:\tabcellsep WORD/SETM/POS\tabcellsep \tabcellsep \multicolumn{2}{l}{Preprocessed Text}\tabcellsep \\
\multicolumn{6}{l}{????/???/ADJ/???????/?????/NN/????/? Individual Word form ??/NN/????/????/PRO/???? /???? /ADJ/????/???/RB/}\\
\multicolumn{4}{l}{????/???/ADV/?????/?????/NN/?????? ,????}\tabcellsep \tabcellsep \\
\multicolumn{4}{l}{?/ADJ/???-???/??? ???/NN/??????/?????/NN/??????????/???????/N Write "WORD/"}\tabcellsep 1\tabcellsep Get POS\\
\multicolumn{4}{l}{N/??????/??????/ADJ/???????/??????/NN/????/}\tabcellsep \tabcellsep \\
\multicolumn{4}{l}{????/PRO/?? ?/?? ?/NN/???/???/VRB/????/???/VRB/}\tabcellsep \tabcellsep \\
\multicolumn{2}{l}{???? / ???? /ADJ/??/??/VRB/EOL/AS/EOL}\tabcellsep Increase\tabcellsep \tabcellsep \tabcellsep Database\\
5.\tabcellsep \multicolumn{2}{l}{count by1}\tabcellsep \multicolumn{2}{l}{Get Steam form}\tabcellsep Is UNK?\tabcellsep No\tabcellsep Year 2017\\
\tabcellsep \tabcellsep \tabcellsep \tabcellsep \tabcellsep Yes\tabcellsep Write "POS/"\tabcellsep 9\\
\tabcellsep \tabcellsep \tabcellsep \multicolumn{2}{l}{Is UNK? Write "STEAM/"}\tabcellsep Get POS from User \& Add to Database Get Steam from User \& Add to Database\tabcellsep ( ) Volume XVII Issue I Version I J\\
\tabcellsep \multicolumn{2}{l}{Tag Description Noun(Except Proper Noun) Proper Noun Adjective Adverb Verb Pronoun Interjection) Indeclinable(Preposition, Conjunction \&}\tabcellsep \multicolumn{2}{l}{Tag Label NN PN ADJ ADV VRB PRO IND}\tabcellsep \multicolumn{2}{l}{Examples ????,??????, ?????, ????, ??? ???????,????????, ?????? ????, ???????,??, ??????, ????? ?????,??? ????,???????, ???????? ??????,??à¦?"??, ??????, ????, ??? ???,????,???????,?????? ???,????,???,????, ???, ?????}\tabcellsep Journal of Researches in Engineering\\
\tabcellsep \tabcellsep \tabcellsep \tabcellsep \tabcellsep \tabcellsep Global\end{longtable} \par
 
\caption{\label{tab_2}?}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{} \par 
\begin{longtable}{P{0.06186931348221671\textwidth}P{0.025310173697270472\textwidth}P{0.167328370554177\textwidth}P{0.5954921422663357\textwidth}}
\tabcellsep \tabcellsep \multicolumn{2}{l}{Annotated Bangla News Corpus and Lexicon Development with POS Tagging and Stemming}\\
Year 2017\tabcellsep \tabcellsep \\
10\tabcellsep \tabcellsep \\
XVII Issue I Version I\tabcellsep VII.\tabcellsep Experiment and Data Analysis\\
( ) Volume J\tabcellsep \tabcellsep Tag Name\tabcellsep Tag Description / Purpose\\
Global Journal of Researches in Engineering\tabcellsep \tabcellsep <ENTRY> <SOURCE></SOURCE> <TYPE></TYPE> <DATE></DATE> <CATAGORY></CATAGORY> <TITLE></TITLE> <CONTENT></CONTENT> </ENTRY>\tabcellsep Statistical counting of our annotated Bangla text provide some qualitative analysis aims to give some complete and detailed description of the observed corpus is shown in Table III. Our Corpus program also To define start of a new news information/data. Source of data. (www.prothom-alo.com) Source type of data (news, blog) Date of collection of data (11-01-12) Genres of that data (sports, crime) Title of news/data Main content of the news. To define end of this news information/data.\\
\tabcellsep \tabcellsep \tabcellsep phenomena which include word level frequency\\
\tabcellsep \tabcellsep \tabcellsep analysis, behavior of Bangla words, use of non-Bangla\\
\tabcellsep \tabcellsep \tabcellsep words, etc. This type of information can be obtained by using\\
\tabcellsep \tabcellsep \tabcellsep a user-defined, query-based annotated text corpus\\
\tabcellsep \tabcellsep \tabcellsep program interface.\\
\tabcellsep \tabcellsep \tabcellsep b) Word frequency Analysis\\
\tabcellsep \tabcellsep \tabcellsep Study of frequency calculation can provide\\
\tabcellsep \tabcellsep \tabcellsep important information about the usage of words in a\\
\tabcellsep \multicolumn{2}{l}{© 2017 Global Journals Inc. (US)}\end{longtable} \par
  {\small\itshape [Note: text]} 
\caption{\label{tab_3}:}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{III} \par 
\begin{longtable}{}
\end{longtable} \par
 
\caption{\label{tab_4}Table III :}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{IV} \par 
\begin{longtable}{}
\end{longtable} \par
 
\caption{\label{tab_5}Table IV :}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{V} \par 
\begin{longtable}{P{0.17098265895953757\textwidth}P{0.4058381502890173\textwidth}P{0.10514450867052022\textwidth}P{0.08057803468208091\textwidth}P{0.08745664739884393\textwidth}}
\multicolumn{4}{l}{Annotated Bangla News Corpus and Lexicon Development with POS Tagging and Stemming}\\
Serial\tabcellsep Information\tabcellsep \tabcellsep Count\\
No\tabcellsep \tabcellsep \tabcellsep \\
1\tabcellsep Number of source\tabcellsep \tabcellsep 4\\
2\tabcellsep Number of source type\tabcellsep \tabcellsep 1\\
3\tabcellsep \multicolumn{2}{l}{Number of fields/genres}\tabcellsep 19\\
4\tabcellsep \multicolumn{2}{l}{Number of Raw word/Number}\tabcellsep 74698\\
5 6 7\tabcellsep \multicolumn{2}{l}{Number of Unique word Number of Unique Stem word Total Number of Sentence}\tabcellsep 13550 1423 5472\tabcellsep Year 2017\\
8\tabcellsep \multicolumn{2}{l}{Number of Assertive Sentence}\tabcellsep 5377\tabcellsep 11\\
9 10 11 12 Word à¦?" ??\tabcellsep \multicolumn{2}{l}{Number of Interrogative Sentence. Number of Exclamatory Sentence. Number of Clitic Number of occurrence of Clitics Percentage Word 1.78 ? 1.34 ??}\tabcellsep 72 23 3 136 Percentage 0.39 0.30\tabcellsep J ( ) Volume XVII Issue I Version I\\
??? ?? ??? ??? ?? Word ??? ?? ??? ????\tabcellsep 1.21 1.15 0.95 0.52 0.47 Percentage 0.4 0.25 0.23 0.20\tabcellsep \multicolumn{2}{l}{??? ???? ?? ??? ? ???? Word Percentage 0.30 0.26 0.17 0.08 0.069 ??? 0.16 ?? 0.15 ?? 0.13 ?? 0.13}\tabcellsep Global Journal of Researches in Engineering\\
????\tabcellsep 0.18\tabcellsep ??\tabcellsep 0.13\\
????\tabcellsep 0.16\tabcellsep ???\tabcellsep 0.13\\
???\tabcellsep 0.16\tabcellsep ??\tabcellsep 0.10\\
??\tabcellsep 0.16\tabcellsep ???\tabcellsep 0.10\end{longtable} \par
  {\small\itshape [Note: © 2017 Global Journals Inc. (US)]} 
\caption{\label{tab_6}Table V :}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{VI} \par 
\begin{longtable}{}
\end{longtable} \par
 
\caption{\label{tab_7}Table VI :}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{VII} \par 
\begin{longtable}{}
\end{longtable} \par
 
\caption{\label{tab_8}Table VII :}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{} \par 
\begin{longtable}{P{0.2521978021978022\textwidth}P{0.11208791208791209\textwidth}P{0.17747252747252748\textwidth}P{0.07472527472527472\textwidth}P{0.23351648351648352\textwidth}}
\tabcellsep \multicolumn{2}{l}{POS Name}\tabcellsep \multicolumn{2}{l}{Percentage}\\
\tabcellsep \tabcellsep NN\tabcellsep \tabcellsep 56.43\\
\tabcellsep \tabcellsep VRB\tabcellsep \tabcellsep 20.53\\
\tabcellsep \tabcellsep ADJ\tabcellsep \tabcellsep 16.41\\
\tabcellsep \tabcellsep PN\tabcellsep \tabcellsep 13.71\\
\tabcellsep \tabcellsep ADV\tabcellsep \tabcellsep 5.94\\
\tabcellsep \tabcellsep PRO\tabcellsep \tabcellsep 3.39\\
\tabcellsep \tabcellsep IND\tabcellsep \tabcellsep 1.98\\
\tabcellsep \tabcellsep CLK\tabcellsep \tabcellsep 0.104\\
\tabcellsep \tabcellsep UNK\tabcellsep \tabcellsep 1.35\\
Year 2017\tabcellsep Prefix\tabcellsep Percentage\tabcellsep Suffix\tabcellsep Percentage\\
12\tabcellsep ?\tabcellsep 9.30\tabcellsep ?\tabcellsep 15.70\\
I\tabcellsep ??\tabcellsep 4.07\tabcellsep ??\tabcellsep 15.43\\
J ( ) Volume XVII Issue I Version\tabcellsep Conclusion ?? ??? ?\tabcellsep 2.23 2.23 1.74\tabcellsep ? ? ?\tabcellsep 8.53 4.72 4.63\\
Journal of Researches in Engineering\tabcellsep \tabcellsep \tabcellsep \tabcellsep \end{longtable} \par
 
\caption{\label{tab_9}}\end{figure}
 			\footnote{© 2017 Global Journals Inc. (US)} 			\footnote{Year 2017 J} 		 		\backmatter  			  				\begin{bibitemlist}{1}
\bibitem[Toutanova and Cherry]{b9}\label{b9} 	 		‘A global model for joint lemmatization and part-of-speech prediction’.  		 			Kristina Toutanova 		,  		 			Colin Cherry 		.  	 	 		\textit{Proceeding on ACL '09 Proceedings of the Joint Conference of the 47th Annual Meeting of the ACL and the 4th International Joint Conference on Natural Language Processing of the AFNLP},  				 (Proceeding on ACL '09 Proceedings of the Joint Conference of the 47th Annual Meeting of the ACL and the 4th International Joint Conference on Natural Language Processing of the AFNLP)  		1 p. .  	 
\bibitem[Md et al.]{b3}\label{b3} 	 		\textit{Analysis of and Observations from a Bangla News Corpus},  		 			Khair Md 		,  		 			Yeasir Arafat 		,  		 			Md Majumder 		,  		 			Naushad Islam 		,  		 			Mumit Uz Zaman 		,  		 			Khan 		.  		Dhaka, Bangladesh.  		 			Center for Research on Bangla Language Processing, BRAC University 		 	 
\bibitem[Asif Iqbal Sarkar et al.]{b2}\label{b2} 	 		\textit{Automatic Bangla Corpus Creation},  		 			Dewan Asif Iqbal Sarkar 		,  		 			Mumit Shahriar Hossain Pavel 		,  		 			Khan 		.  		Dhaka, Bangladesh.  		 			BRAC University 		 	 
\bibitem[Dash ()]{b5}\label{b5} 	 		\textit{Corpus Linguistics and Language Technology},  		 			N S Dash 		.  		2005. New Delhi.  	 	 (Mittal) 
\bibitem[Cieri and Liberman]{b0}\label{b0} 	 		\textit{Issues in Corpus Creation and Distribution: The Evolution of the Linguistic Data Consortium, University of Pennsylvania and Linguistic Data Consortium Philadelphia},  		 			C Cieri 		,  		 			M Liberman 		.  		Pennsylvania, USA.  	 
\bibitem[Hasan ()]{b1}\label{b1} 	 		\textit{Master's thesis, School of Computer Science and Information Technology},  		 			J Hasan 		.  		2001.  		 			RMIT University 		 	 	 (Automatic dictionary construction from large collections of text) 
\bibitem[M Asaduzzaman and Ali ()]{b8}\label{b8} 	 		‘Morphological Analysis of Bangla Words for Automatic Machine Translation’.  		 			M M Asaduzzaman 		,  		 			Muhammad Masroor Ali 		.  	 	 		\textit{6th International Conference on Computer and Information Technology (ICCIT) 2003. Jahangirnagar University},  				 (Dhaka, Bangladesh)  		2003. p. .  	 
\bibitem[Md Hanif Seddiqui et al.]{b7}\label{b7} 	 		‘Parts of speech tagging using morphological analysis in bangla’.  		 			Md Hanif Seddiqui 		,  		 			Abdullah Al Rana 		,  		 			Taufique Mahmud 		,  		 			Sayeed 		.  	 	 		\textit{Proceeding of the 6th International Conference on Computer and Information Technology (ICCIT)},  				 (Proceeding of the 6th International Conference on Computer and Information Technology (ICCIT), Bangladesh)  		 	 
\bibitem[Bharati et al. (1998)]{b4}\label{b4} 	 		‘Some Observations Regarding Corpora of Some Indian Languages’.  		 			A Bharati 		,  		 			R Sangal 		,  		 			S M Bendre 		.  	 	 		\textit{Proc. Intl. Conf. Knowledge Based Computer Systems (KBCS98)},  				 (Intl. Conf. Knowledge Based Computer Systems (KBCS98), NCST, Mumbai)  		19 Dec. 1998. p. 17.  	 
\bibitem[Nur Hossain Khan et al. ()]{b6}\label{b6} 	 		‘Verification of Bangla Sentence Structure using N-Gram’.  		 			Md Nur Hossain Khan 		,  		 			Md Khan 		,  		 			Md Islam 		,  		 			Habibur Rahman 		,  		 			Bappa Sarker 		.  	 	 		\textit{Global Journal of Computer Science and Technology}  		2014. 14.  	 	 (Issue 1 Version 1.0 Year) 
\end{bibitemlist}
 			 		 	 
\end{document}