<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    隨筆-23  評論-58  文章-0  trackbacks-0
    基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    查看第2版:正向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

    public class MM2 
    {
        
    private static final Log log = LogFactory.getLog(MM2.class);
        
        
    private static HashMap<String, Integer> dictionary = null
        
    private static final int WORD_MAX_LENGTH = 9;
        
    private Reader reader;
        
        
    static
        
    {
            loadDictionary();
        }

        
        
    public MM2(Reader reader) 
        

            
    this.reader = reader; 
        }
     
        
        
    //切分出由中文、字母、數(shù)字組成的句子
        public ArrayList<Sentence> getSentence() throws IOException
        
    {   
            ArrayList
    <Sentence> list=new ArrayList<Sentence>();
            StringBuffer cb
    =new StringBuffer();
            
    int d=reader.read();
            
    int offset=0;
            
    boolean b=false;
            
    while(d>-1)
            
    {
                
    int type=Character.getType(d);
                
    if(type==2 || type==9 || type==5)
                
    {
                    d
    =toAscii(d);
                    cb.append((
    char)d);
                }

                
    else
                
    {
                    b
    =true;
                }

                d
    =reader.read();
                
    if(d==-1 || b)
                
    {
                    
    if(d==-1) offset++;
                    b
    =false;
                    
    char[] ioBuffer = new char[cb.length()];
                    cb.getChars(
    0, cb.length(), ioBuffer, 0);
                    Sentence sen
    =new Sentence(ioBuffer,offset-cb.length());
                    list.add(sen);
                    cb.setLength(
    0);
                }

                offset
    ++;
            }

            
    return list;
        }

        
        
    //將句子切分出詞
        public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
        
    {
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            
    for(Sentence sen:list)
            
    {
                StringBuffer word 
    = new StringBuffer();
                
    int offset=sen.getStartOffset();
                
    int bufferIndex = 0;
                
    char c;
                
    boolean b=false;
                
    while(bufferIndex<sen.getText().length)
                
    {
                    offset
    ++;
                    c
    =sen.getText()[bufferIndex++];
                    
    if(word.length()==0)
                        word.append(c);
                    
    else
                    
    {
                        String temp 
    = (word.toString() + c).intern();
                        
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                            word.append(c);
                        
    else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
                            word.append(c);
                        
    else
                        
    {
                            bufferIndex
    --;
                            offset
    --;
                            
    while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                            
    {
                                word.deleteCharAt(word.length()
    -1);
                                bufferIndex
    --;
                                offset
    --;
                            }

                            b
    =true;
                        }

                    }

                    
    if(b || bufferIndex==sen.getText().length)
                    
    {
                        Token token 
    = new Token(word.toString(),offset-word.length(),offset,"word");
                        word.setLength(
    0);
                        tokenlist.add(token);
                        b
    =false;
                    }

                }

            }

            
    return tokenlist;
        }

        
        
    //將相連的單個英文或數(shù)字組合成詞
        public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
        
    {
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            Token word
    =null;
            
    for(int i=0;i<list.size();i++)
            
    {
                Token t
    =list.get(i);
                
    if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
                
    {
                    
    if(word==null)
                        word
    =t;
                    
    else if(word.getEnd()==t.getStart())
                    
    {
                        word.setEnd(t.getEnd());
                        word.setWord(word.getWord()
    +t.getWord());
                    }

                    
    else
                    
    {
                        tokenlist.add(word);
                        word
    =t;
                    }

                }

                
    else if(word!=null)
                
    {
                    tokenlist.add(word);
                    word
    =null;
                    tokenlist.add(t);
                }

                
    else
                    tokenlist.add(t);
            }

            
    if(word!=null)
                tokenlist.add(word);
            
    return tokenlist;
        }

        
        
    //雙角轉(zhuǎn)單角
        public static int toAscii(int codePoint) 
        
    {
            
    if((codePoint>=65296 && codePoint<=65305)    //0-9
                    || (codePoint>=65313 && codePoint<=65338)    //A-Z
                    || (codePoint>=65345 && codePoint<=65370)    //a-z
                    )
            
    {    
                codePoint 
    -= 65248;
            }

            
    return codePoint;
        }

        
        
    //加載詞典
        public static void loadDictionary() 
        
    {  
            
    if (dictionary == null
            
    {    
                dictionary 
    = new HashMap<String, Integer>();    
                InputStream is 
    = null;    
                BufferedReader br 
    = null;            
                
    try
                
    {
                    is 
    = new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
                    br 
    = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                    String word 
    = null;
                    
    while ((word = br.readLine()) != null
                    
    {
                        word
    =word.toLowerCase();
                        
    if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                        
    {
                            dictionary.put(word.intern(), 
    1);    
                            
    int i = word.length()-1
                            
    while(i >= 2)
                            
    {
                                String temp 
    = word.substring(0, i).intern(); 
                                
    if (!dictionary.containsKey(temp))
                                    dictionary.put(temp,
    2); 
                                i
    --;
                            }

                        }

                    }

                }

                
    catch (Exception e) 
                
    {      
                    log.info(e);
                }

                
    finally
                
    {
                    
    try 
                    
    {      
                        
    if(br!=null)
                            br.close();   
                        
    if(is!=null)
                            is.close();  
                    }

                    
    catch (IOException e)
                    
    {     
                        log.info(e);
                    }
                
                }
     
            }
     
        }

        
        
    public static String[] segWords(Reader input)
        
    {
            ArrayList
    <String> list=new ArrayList<String>();
            
    try
            
    {
                MM2 f
    =new MM2(input);
                ArrayList
    <Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
                
    for(Token t:tlist)
                
    {
                    list.add(t.getWord());
                }

            }

            
    catch(IOException e)
            
    {
                log.info(e);
            }

            
    return (String[])list.toArray(new String[0]);
        }

        
        
    public static void main(String[] args) 
        
    {
            String[] cc
    =MM2.segWords(new StringReader("ibm商務機t60p".toLowerCase()));
            
    for(String c:cc)
            
    {
                System.out.println(c);
            }

        }

    }
    posted on 2011-08-04 15:31 nianzai 閱讀(3461) 評論(1)  編輯  收藏 所屬分類: 中文分詞

    評論:
    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2014-09-13 18:30 | 余道
    您好,您沒有給出Sentence和Token的定義,我猜不出啊

    hdwgz@qq.com  回復  更多評論
      
    主站蜘蛛池模板: 亚洲免费观看网站| 亚洲男女一区二区三区| 看成年女人免费午夜视频| 在线免费观看污网站| 亚洲欧好州第一的日产suv| 午夜色a大片在线观看免费| 亚洲国产高清国产拍精品| 国产最新凸凹视频免费| 美女无遮挡免费视频网站| 亚洲国产V高清在线观看| 一级a性色生活片久久无少妇一级婬片免费放| 四虎影视永久免费观看地址| 色www免费视频| 在线A亚洲老鸭窝天堂| 黄页免费在线观看| 亚洲国产精品综合一区在线| 在线免费观看一级毛片| 老司机午夜在线视频免费| 亚洲午夜福利AV一区二区无码| 国产色爽免费无码视频| 亚洲理论在线观看| 免费的一级片网站| 一级毛片aa高清免费观看| 91亚洲自偷手机在线观看| 嫩草视频在线免费观看| 一级毛片免费毛片毛片| 亚洲精品免费视频| 成年女人喷潮毛片免费播放| 精品国产福利尤物免费| 亚洲第一成年网站大全亚洲| 国产网站免费观看| 男人的天堂网免费网站| 亚洲mv国产精品mv日本mv| 亚洲国产成人久久综合一区77| 国产午夜无码精品免费看| 亚洲人片在线观看天堂无码| 伊人亚洲综合青草青草久热| 成人免费激情视频| 久久嫩草影院免费看夜色| 中文字幕在线观看亚洲视频| 国产亚洲人成A在线V网站|