<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    Terry.Li-彬

    虛其心,可解天下之問;專其心,可治天下之學;靜其心,可悟天下之理;恒其心,可成天下之業。

      BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
      143 隨筆 :: 344 文章 :: 130 評論 :: 0 Trackbacks
    基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    public?class?MM2?
    {
    ????
    private?static?final?Log?log?=?LogFactory.getLog(MM2.class);
    ????
    ????
    private?static?HashMap<String,?Integer>?dictionary?=?null;?
    ????
    private?static?final?int?WORD_MAX_LENGTH?=?9;
    ????
    private?Reader?reader;
    ????
    ????
    static
    ????
    {
    ????????loadDictionary();
    ????}

    ????
    ????
    public?MM2(Reader?reader)?
    ????
    {?
    ????????
    this.reader?=?reader;?
    ????}
    ?
    ????
    ????
    //切分出由中文、字母、數(shù)字組成的句子
    ????public?ArrayList<Sentence>?getSentence()?throws?IOException
    ????
    {???
    ????????ArrayList
    <Sentence>?list=new?ArrayList<Sentence>();
    ????????StringBuffer?cb
    =new?StringBuffer();
    ????????
    int?d=reader.read();
    ????????
    int?offset=0;
    ????????
    boolean?b=false;
    ????????
    while(d>-1)
    ????????
    {
    ????????????
    int?type=Character.getType(d);
    ????????????
    if(type==2?||?type==9?||?type==5)
    ????????????
    {
    ????????????????d
    =toAscii(d);
    ????????????????cb.append((
    char)d);
    ????????????}

    ????????????
    else
    ????????????
    {
    ????????????????b
    =true;
    ????????????}

    ????????????d
    =reader.read();
    ????????????
    if(d==-1?||?b)
    ????????????
    {
    ????????????????
    if(d==-1)?offset++;
    ????????????????b
    =false;
    ????????????????
    char[]?ioBuffer?=?new?char[cb.length()];
    ????????????????cb.getChars(
    0,?cb.length(),?ioBuffer,?0);
    ????????????????Sentence?sen
    =new?Sentence(ioBuffer,offset-cb.length());
    ????????????????list.add(sen);
    ????????????????cb.setLength(
    0);
    ????????????}

    ????????????offset
    ++;
    ????????}

    ????????
    return?list;
    ????}

    ????
    ????
    //將句子切分出詞
    ????public?ArrayList<Token>?getToken(ArrayList<Sentence>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????
    for(Sentence?sen:list)
    ????????
    {
    ????????????StringBuffer?word?
    =?new?StringBuffer();
    ????????????
    int?offset=sen.getStartOffset();
    ????????????
    int?bufferIndex?=?0;
    ????????????
    char?c;
    ????????????
    boolean?b=false;
    ????????????
    while(bufferIndex<sen.getText().length)
    ????????????
    {
    ????????????????offset
    ++;
    ????????????????c
    =sen.getText()[bufferIndex++];
    ????????????????
    if(word.length()==0)
    ????????????????????word.append(c);
    ????????????????
    else
    ????????????????
    {
    ????????????????????String?temp?
    =?(word.toString()?+?c).intern();
    ????????????????????
    if(dictionary.containsKey(temp)?&&?dictionary.get(temp)==1)
    ????????????????????????word.append(c);
    ????????????????????
    else?if(dictionary.containsKey(temp)?&&?bufferIndex<sen.getText().length)
    ????????????????????????word.append(c);
    ????????????????????
    else
    ????????????????????
    {
    ????????????????????????bufferIndex
    --;
    ????????????????????????offset
    --;
    ????????????????????????
    while(word.length()>1?&&?dictionary.get(word.toString())!=null?&&?dictionary.get(word.toString())==2)
    ????????????????????????
    {
    ????????????????????????????word.deleteCharAt(word.length()
    -1);
    ????????????????????????????bufferIndex
    --;
    ????????????????????????????offset
    --;
    ????????????????????????}

    ????????????????????????b
    =true;
    ????????????????????}

    ????????????????}

    ????????????????
    if(b?||?bufferIndex==sen.getText().length)
    ????????????????
    {
    ????????????????????Token?token?
    =?new?Token(word.toString(),offset-word.length(),offset,"word");
    ????????????????????word.setLength(
    0);
    ????????????????????tokenlist.add(token);
    ????????????????????b
    =false;
    ????????????????}

    ????????????}

    ????????}

    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //將相連的單個英文或數(shù)字組合成詞
    ????public?ArrayList<Token>?getNewToken(ArrayList<Token>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????Token?word
    =null;
    ????????
    for(int?i=0;i<list.size();i++)
    ????????
    {
    ????????????Token?t
    =list.get(i);
    ????????????
    if(t.getWord().length()==1?&&?Character.getType((int)t.getWord().charAt(0))!=5)
    ????????????
    {
    ????????????????
    if(word==null)
    ????????????????????word
    =t;
    ????????????????
    else?if(word.getEnd()==t.getStart())
    ????????????????
    {
    ????????????????????word.setEnd(t.getEnd());
    ????????????????????word.setWord(word.getWord()
    +t.getWord());
    ????????????????}

    ????????????????
    else
    ????????????????
    {
    ????????????????????tokenlist.add(word);
    ????????????????????word
    =t;
    ????????????????}

    ????????????}

    ????????????
    else?if(word!=null)
    ????????????
    {
    ????????????????tokenlist.add(word);
    ????????????????word
    =null;
    ????????????????tokenlist.add(t);
    ????????????}

    ????????????
    else
    ????????????????tokenlist.add(t);
    ????????}

    ????????
    if(word!=null)
    ????????????tokenlist.add(word);
    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //雙角轉(zhuǎn)單角
    ????public?static?int?toAscii(int?codePoint)?
    ????
    {
    ????????
    if((codePoint>=65296?&&?codePoint<=65305)????//0-9
    ????????????????||?(codePoint>=65313?&&?codePoint<=65338)????//A-Z
    ????????????????||?(codePoint>=65345?&&?codePoint<=65370)????//a-z
    ????????????????)
    ????????
    {????
    ????????????codePoint?
    -=?65248;
    ????????}

    ????????
    return?codePoint;
    ????}

    ????
    ????
    //加載詞典
    ????public?static?void?loadDictionary()?
    ????
    {??
    ????????
    if?(dictionary?==?null)?
    ????????
    {????
    ????????????dictionary?
    =?new?HashMap<String,?Integer>();????
    ????????????InputStream?is?
    =?null;????
    ????????????BufferedReader?br?
    =?null;????????????
    ????????????
    try
    ????????????
    {
    ????????????????is?
    =?new?FileInputStream(new?File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
    ????????????????br?
    =?new?BufferedReader(new?InputStreamReader(is,?"UTF-8"));
    ????????????????String?word?
    =?null;
    ????????????????
    while?((word?=?br.readLine())?!=?null)?
    ????????????????
    {
    ????????????????????word
    =word.toLowerCase();
    ????????????????????
    if?((word.indexOf("#")?==?-1)?&&?(word.length()?<=?WORD_MAX_LENGTH))
    ????????????????????
    {
    ????????????????????????dictionary.put(word.intern(),?
    1);????
    ????????????????????????
    int?i?=?word.length()-1;?
    ????????????????????????
    while(i?>=?2)
    ????????????????????????
    {
    ????????????????????????????String?temp?
    =?word.substring(0,?i).intern();?
    ????????????????????????????
    if?(!dictionary.containsKey(temp))
    ????????????????????????????????dictionary.put(temp,
    2);?
    ????????????????????????????i
    --;
    ????????????????????????}

    ????????????????????}

    ????????????????}

    ????????????}

    ????????????
    catch?(Exception?e)?
    ????????????
    {??????
    ????????????????log.info(e);
    ????????????}

    ????????????
    finally
    ????????????
    {
    ????????????????
    try?
    ????????????????
    {??????
    ????????????????????
    if(br!=null)
    ????????????????????????br.close();???
    ????????????????????
    if(is!=null)
    ????????????????????????is.close();??
    ????????????????}

    ????????????????
    catch?(IOException?e)
    ????????????????
    {?????
    ????????????????????log.info(e);
    ????????????????}
    ????????????
    ????????????}
    ?
    ????????}
    ?
    ????}

    ????
    ????
    public?static?String[]?segWords(Reader?input)
    ????
    {
    ????????ArrayList
    <String>?list=new?ArrayList<String>();
    ????????
    try
    ????????
    {
    ????????????MM2?f
    =new?MM2(input);
    ????????????ArrayList
    <Token>?tlist=?f.getNewToken(f.getToken(f.getSentence()));
    ????????????
    for(Token?t:tlist)
    ????????????
    {
    ????????????????list.add(t.getWord());
    ????????????}

    ????????}

    ????????
    catch(IOException?e)
    ????????
    {
    ????????????log.info(e);
    ????????}

    ????????
    return?(String[])list.toArray(new?String[0]);
    ????}

    ????
    ????
    public?static?void?main(String[]?args)?
    ????
    {
    ????????String[]?cc
    =MM2.segWords(new?StringReader("ibm商務(wù)機t60p".toLowerCase()));
    ????????
    for(String?c:cc)
    ????????
    {
    ????????????System.out.println(c);
    ????????}

    ????}

    }
    posted on 2011-08-05 08:34 禮物 閱讀(2118) 評論(2)  編輯  收藏

    評論

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-07-25 22:09 yi
    這是全的么,樓主?我導入到MyEclipse里好多錯誤呀,除了import包之外還有好多錯,看不懂。。。  回復  更多評論
      

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-08-22 20:01 love code
    麻煩 博主把dictionary.txt發給我吧,讓我學習學習
    1182787467@qq.com
    謝謝  回復  更多評論
      


    只有注冊用戶登錄后才能發(fā)表評論。

    網站導航:
    博客園   IT新聞   Chat2DB   C++博客   博問  
     
    主站蜘蛛池模板: 亚洲精华液一二三产区| 亚洲精品无码久久久久YW| 美女黄色免费网站| 国产禁女女网站免费看| 亚洲av无码专区在线电影天堂 | 国产在线a免费观看| 久久久久亚洲AV成人片| 国产大片免费网站不卡美女| 亚洲国产高清人在线| 亚洲黄色免费观看| 在线观看亚洲AV日韩A∨| 国产极品美女高潮抽搐免费网站 | 免费夜色污私人影院网站| 亚洲精品国产V片在线观看| 久久国产精品免费一区二区三区| 久久久无码精品亚洲日韩软件| 中文字幕在线视频免费| 亚洲国产综合91精品麻豆| 丁香花免费高清视频完整版| 久久精品国产亚洲av天美18| 亚洲日本韩国在线| 日韩精品无码一区二区三区免费 | 无码天堂亚洲国产AV| 亚洲中文字幕久久精品无码APP | 国产亚洲福利精品一区| 最近中文字幕免费2019| 亚洲国产精品无码久久98 | 亚洲成a人片在线观看中文!!!| 成人黄18免费视频| 四虎影视久久久免费观看| 久久久久亚洲AV无码网站| 日韩毛片无码永久免费看| 两个人日本免费完整版在线观看1| 久久精品国产亚洲av高清漫画| 毛片免费vip会员在线看| 国产va免费观看| 亚洲13又紧又嫩又水多| 亚洲一级Av无码毛片久久精品| 最近高清中文字幕免费| 老湿机一区午夜精品免费福利| 亚洲日本va午夜中文字幕一区|