<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    隨筆-23  評論-58  文章-0  trackbacks-0
    基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    查看第2版:正向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

    public class MM2 
    {
        
    private static final Log log = LogFactory.getLog(MM2.class);
        
        
    private static HashMap<String, Integer> dictionary = null
        
    private static final int WORD_MAX_LENGTH = 9;
        
    private Reader reader;
        
        
    static
        
    {
            loadDictionary();
        }

        
        
    /**
     * Creates a segmenter that takes its input text from the given reader.
     *
     * @param reader character source later consumed by {@link #getSentence()}
     */
    public MM2(Reader reader)
    {
        this.reader = reader;
    }
     
        
        
    //切分出由中文、字母、數(shù)字組成的句子
        public ArrayList<Sentence> getSentence() throws IOException
        
    {   
            ArrayList
    <Sentence> list=new ArrayList<Sentence>();
            StringBuffer cb
    =new StringBuffer();
            
    int d=reader.read();
            
    int offset=0;
            
    boolean b=false;
            
    while(d>-1)
            
    {
                
    int type=Character.getType(d);
                
    if(type==2 || type==9 || type==5)
                
    {
                    d
    =toAscii(d);
                    cb.append((
    char)d);
                }

                
    else
                
    {
                    b
    =true;
                }

                d
    =reader.read();
                
    if(d==-1 || b)
                
    {
                    
    if(d==-1) offset++;
                    b
    =false;
                    
    char[] ioBuffer = new char[cb.length()];
                    cb.getChars(
    0, cb.length(), ioBuffer, 0);
                    Sentence sen
    =new Sentence(ioBuffer,offset-cb.length());
                    list.add(sen);
                    cb.setLength(
    0);
                }

                offset
    ++;
            }

            
    return list;
        }

        
        
    //將句子切分出詞
        public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
        
    {
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            
    for(Sentence sen:list)
            
    {
                StringBuffer word 
    = new StringBuffer();
                
    int offset=sen.getStartOffset();
                
    int bufferIndex = 0;
                
    char c;
                
    boolean b=false;
                
    while(bufferIndex<sen.getText().length)
                
    {
                    offset
    ++;
                    c
    =sen.getText()[bufferIndex++];
                    
    if(word.length()==0)
                        word.append(c);
                    
    else
                    
    {
                        String temp 
    = (word.toString() + c).intern();
                        
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                            word.append(c);
                        
    else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
                            word.append(c);
                        
    else
                        
    {
                            bufferIndex
    --;
                            offset
    --;
                            
    while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                            
    {
                                word.deleteCharAt(word.length()
    -1);
                                bufferIndex
    --;
                                offset
    --;
                            }

                            b
    =true;
                        }

                    }

                    
    if(b || bufferIndex==sen.getText().length)
                    
    {
                        Token token 
    = new Token(word.toString(),offset-word.length(),offset,"word");
                        word.setLength(
    0);
                        tokenlist.add(token);
                        b
    =false;
                    }

                }

            }

            
    return tokenlist;
        }

        
        
    //將相連的單個(gè)英文或數(shù)字組合成詞
        public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
        
    {
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            Token word
    =null;
            
    for(int i=0;i<list.size();i++)
            
    {
                Token t
    =list.get(i);
                
    if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
                
    {
                    
    if(word==null)
                        word
    =t;
                    
    else if(word.getEnd()==t.getStart())
                    
    {
                        word.setEnd(t.getEnd());
                        word.setWord(word.getWord()
    +t.getWord());
                    }

                    
    else
                    
    {
                        tokenlist.add(word);
                        word
    =t;
                    }

                }

                
    else if(word!=null)
                
    {
                    tokenlist.add(word);
                    word
    =null;
                    tokenlist.add(t);
                }

                
    else
                    tokenlist.add(t);
            }

            
    if(word!=null)
                tokenlist.add(word);
            
    return tokenlist;
        }

        
        
    //雙角轉(zhuǎn)單角
        public static int toAscii(int codePoint) 
        
    {
            
    if((codePoint>=65296 && codePoint<=65305)    //0-9
                    || (codePoint>=65313 && codePoint<=65338)    //A-Z
                    || (codePoint>=65345 && codePoint<=65370)    //a-z
                    )
            
    {    
                codePoint 
    -= 65248;
            }

            
    return codePoint;
        }

        
        
    //加載詞典
        public static void loadDictionary() 
        
    {  
            
    if (dictionary == null
            
    {    
                dictionary 
    = new HashMap<String, Integer>();    
                InputStream is 
    = null;    
                BufferedReader br 
    = null;            
                
    try
                
    {
                    is 
    = new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
                    br 
    = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                    String word 
    = null;
                    
    while ((word = br.readLine()) != null
                    
    {
                        word
    =word.toLowerCase();
                        
    if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                        
    {
                            dictionary.put(word.intern(), 
    1);    
                            
    int i = word.length()-1
                            
    while(i >= 2)
                            
    {
                                String temp 
    = word.substring(0, i).intern(); 
                                
    if (!dictionary.containsKey(temp))
                                    dictionary.put(temp,
    2); 
                                i
    --;
                            }

                        }

                    }

                }

                
    catch (Exception e) 
                
    {      
                    log.info(e);
                }

                
    finally
                
    {
                    
    try 
                    
    {      
                        
    if(br!=null)
                            br.close();   
                        
    if(is!=null)
                            is.close();  
                    }

                    
    catch (IOException e)
                    
    {     
                        log.info(e);
                    }
                
                }
     
            }
     
        }

        
        
    public static String[] segWords(Reader input)
        
    {
            ArrayList
    <String> list=new ArrayList<String>();
            
    try
            
    {
                MM2 f
    =new MM2(input);
                ArrayList
    <Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
                
    for(Token t:tlist)
                
    {
                    list.add(t.getWord());
                }

            }

            
    catch(IOException e)
            
    {
                log.info(e);
            }

            
    return (String[])list.toArray(new String[0]);
        }

        
        
    public static void main(String[] args) 
        
    {
            String[] cc
    =MM2.segWords(new StringReader("ibm商務(wù)機(jī)t60p".toLowerCase()));
            
    for(String c:cc)
            
    {
                System.out.println(c);
            }

        }

    }
    posted on 2011-08-04 15:31 nianzai 閱讀(3463) 評論(1)  編輯  收藏 所屬分類: 中文分詞

    評(píng)論:
    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2014-09-13 18:30 | 余道
    您好,您沒有給出Sentence和Token的定義,我猜不出啊

    hdwgz@qq.com  回復  更多評論
      
    主站蜘蛛池模板: 色九月亚洲综合网| 一级特黄录像免费播放肥| 国产无限免费观看黄网站| 96免费精品视频在线观看| 卡一卡二卡三在线入口免费| 国产成人精品亚洲精品| 亚洲天堂福利视频| 九一在线完整视频免费观看| 91福利视频免费观看| 无码专区一va亚洲v专区在线| 久久久久亚洲精品天堂| 白白色免费在线视频| 3344免费播放观看视频| 亚洲A∨精品一区二区三区| 亚洲午夜电影一区二区三区| 久久成人18免费网站| 最近中文字幕免费mv视频7| 亚洲AV无码久久精品蜜桃| 香蕉视频亚洲一级| 亚洲免费视频观看| 亚洲最大AV网站在线观看| 亚洲heyzo专区无码综合| 8888四色奇米在线观看免费看| 亚洲国产精品毛片av不卡在线| 最新亚洲春色Av无码专区| 免费精品一区二区三区第35| heyzo亚洲精品日韩| 亚洲欧洲另类春色校园网站| 在线播放免费人成毛片乱码| 亚洲成A∨人片天堂网无码| 亚洲www在线观看| 无码人妻精品中文字幕免费| 不卡精品国产_亚洲人成在线| 久久精品国产亚洲av天美18| 成年人视频免费在线观看| 亚洲AV综合色区无码一区爱AV| 免费一区二区无码视频在线播放 | 久久精品国产亚洲香蕉| 香蕉视频免费在线| 夫妻免费无码V看片| 亚洲成a人片在线观看播放|