A Multithreaded Java Web Crawler

Below is a Java crawler program. Starting from a specified home page, it fetches pages under that site's domain to a specified depth and maintains a simple index of the saved pages.

Parameters: private int webDepth = 2; // crawl depth. The home page has depth 1; once the depth is set, pages beyond it are not fetched. private int intThreadNum = 10; // the number of worker threads to start.

While running, the program writes a report.txt file in the source directory recording the crawl's progress, and when the crawl finishes it writes a fileindex.txt file that indexes the saved page files.
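
Each index entry records the page's depth, the file it was saved to, and its URL. Based on the format string in the source below, an entry might look like this (the URL and numbers are illustrative):

Web depth:2 Filepath: web/web5.htm url:http://www.example.com/page.htm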

The program makes use of multithreading (shared variables and synchronization), generics, file operations, the URL class and URL connections, the Hashtable associative array, and regular expressions with their related classes.

The program is run with command-line arguments. The first argument must be a valid URL string starting with http://, used as the crawler's home page; the second, optional, argument is a string convertible to int (one that the static method Integer.parseInt(String s) can parse, such as 3), used as the crawl depth. If it is omitted, the depth defaults to 2.
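
For example, assuming the file has been compiled with javac, a crawl of a hypothetical site to depth 3 would be started like this (the URL is a placeholder):

javac GetWeb.java
java GetWeb http://www.example.com 3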

The program's shortcomings: only the three cases href=, href=', and href=" followed by an absolute URL are handled (URL forms in page source are quite messy, so processing can still go wrong in places), while relative URLs and window.open(' links are not handled; exception handling is also only rudimentary. If readers have ways to improve it, please post your source code; it would be much appreciated.
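
As one possible improvement, a relative link could be resolved against the URL of the page it appeared on via the two-argument java.net.URL constructor. The sketch below is illustrative only; the helper resolveLink is hypothetical and not part of the program that follows:

import java.net.MalformedURLException;
import java.net.URL;

// Hypothetical helper: resolve a possibly-relative link such as
// "/page2.htm" against the URL of the page it came from.
static String resolveLink(String pageUrl, String link) {
    try {
        return new URL(new URL(pageUrl), link).toString();
    } catch (MalformedURLException e) {
        return null; // skip links that cannot be parsed
    }
}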

The source code is attached below (save it as GetWeb.java):

import java.io.File;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Hashtable;

public class GetWeb {

    private int webDepth = 2; // crawl depth
    private int intThreadNum = 10; // number of worker threads
    private String strHomePage = ""; // home page URL
    private String myDomain; // domain name
    private String fPath = "web"; // directory for the saved page files
    private ArrayList<String> arrUrls = new ArrayList<String>(); // URLs not yet processed
    private ArrayList<String> arrUrl = new ArrayList<String>(); // all URLs, kept for building the index
    private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>(); // page number of every URL
    private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>(); // depth of every URL
    private int intWebIndex = 0; // file index of a page, starting from 0
    private String charset = "GB2312";
    private String report = "";
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    public synchronized void addReport(String s) {
        try {
            report += s;
            PrintWriter pwReport = new PrintWriter(new FileOutputStream("report.txt"));
            pwReport.println(report);
            pwReport.close();
        } catch (Exception e) {
            System.out.println("Failed to write report file!");
        }
    }

    public synchronized String getAUrl() {
        if (arrUrls.isEmpty()) {
            return null; // guards against the check-then-act race between worker threads
        }
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        if (args.length == 0 || args[0].equals("")) {
            System.out.println("No input!");
            System.exit(1);
        } else if (args.length == 1) {
            GetWeb gw = new GetWeb(args[0]);
            gw.getWebByHomePage();
        } else {
            GetWeb gw = new GetWeb(args[0], Integer.parseInt(args[1]));
            gw.getWebByHomePage();
        }
    }

    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            // System.exit(1);
            return;
        }
        System.out.println("Homepage = " + strHomePage);
        addReport("Homepage = " + strHomePage + "!\n");
        System.out.println("Domain = " + myDomain);
        addReport("Domain = " + myDomain + "!\n");
        // seed the queues and tables with the home page (page 0, depth 1)
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);
        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }
        System.out.println("Start!");
        this.addReport("Start!\n");
        // fetch the home page on the main thread, then start the workers
        String tmp = getAUrl();
        this.getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
        int i = 0;
        for (i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        while (true) {
            // the crawl is done when the queue is empty and only the main thread is left
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\n\n\n\n\nFinished!");
                addReport("\n\n\n\n\nFinished!\n");
                System.out.println("Start time = " + startTime + " "
                        + "Finish time = " + finishTime + " "
                        + "Cost time = " + costTime + "ms");
                addReport("Start time = " + startTime + " "
                        + "Finish time = " + finishTime + " "
                        + "Cost time = " + costTime + "ms" + "\n");
                System.out.println("Total url number = "
                        + (webSuccessed + webFailed) + " Succeeded: "
                        + webSuccessed + " Failed: " + webFailed);
                addReport("Total url number = " + (webSuccessed + webFailed)
                        + " Succeeded: " + webSuccessed + " Failed: "
                        + webFailed + "\n");
                // build and write the file index
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth:" + deepUrls.get(tmpUrl)
                            + " Filepath: " + fPath + "/web"
                            + allUrls.get(tmpUrl) + ".htm" + " url:" + tmpUrl
                            + "\n\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to write index file!");
                }
                break;
            }
        }
    }

    public void getWebByUrl(String strUrl, String charset, String fileIndex) {
        try {
            // if (charset == null || "".equals(charset)) charset = "utf-8";
            System.out.println("Getting web by url: " + strUrl);
            addReport("Getting web by url: " + strUrl + "\n");
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            conn.setDoOutput(true);
            InputStream is = null;
            is = url.openStream();
            // save the page as web<fileIndex>.htm under the storage directory
            String filePath = fPath + "/web" + fileIndex + ".htm";
            PrintWriter pw = null;
            FileOutputStream fos = new FileOutputStream(filePath);
            // decode and encode using the given charset (GB2312 by default)
            OutputStreamWriter writer = new OutputStreamWriter(fos, charset);
            pw = new PrintWriter(writer);
            BufferedReader bReader = new BufferedReader(new InputStreamReader(is, charset));
            StringBuffer sb = new StringBuffer();
            String rLine = null;
            String tmp_rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                tmp_rLine = rLine;
                int str_len = tmp_rLine.length();
                if (str_len > 0) {
                    sb.append("\n" + tmp_rLine);
                    pw.println(tmp_rLine);
                    pw.flush();
                    // only scan for new links while this page is above the depth limit
                    if (deepUrls.get(strUrl) < webDepth)
                        getUrlByString(tmp_rLine, strUrl);
                }
                tmp_rLine = null;
            }
            is.close();
            pw.close();
            System.out.println("Get web successfully! " + strUrl);
            addReport("Get web successfully! " + strUrl + "\n");
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Get web failed! " + strUrl);
            addReport("Get web failed! " + strUrl + "\n");
            addWebFailed();
        }
    }

    public String getDomain() {
        String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        boolean blnp = m.find();
        if (blnp == true) {
            return m.group(0);
        }
        return null;
    }

    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        // only absolute http:// links following href=, href=' or href=" are handled
        String regUrl = "(?<=(href=)[\"]?[\']?)http://[^\\s\"\'\\?]*("
                + myDomain + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        // int i = 0;
        while (blnp == true) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Find a new url,depth:"
                        + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                addReport("Find a new url,depth:" + (deepUrls.get(strUrl) + 1)
                        + " " + m.group(0) + "\n");
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end(), tmpStr.length());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    class Processer implements Runnable {
        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            // Thread.sleep(5000);
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                if (tmp == null) {
                    break; // another worker took the last pending URL
                }
                getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
            }
        }
    }
}
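
A design note: the main loop above busy-waits until Thread.activeCount() == 1, which can misbehave if the JVM runs extra non-crawler threads. A sketch of one alternative (not part of the original program) keeps references to the workers and joins them instead:

// Sketch: replace the start-and-poll loop in getWebByHomePage with explicit joins.
Thread[] workers = new Thread[intThreadNum];
for (int i = 0; i < intThreadNum; i++) {
    workers[i] = new Thread(new Processer(this));
    workers[i].start();
}
for (Thread w : workers) {
    try {
        w.join(); // block until this worker has drained the URL queue
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}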

posted on 2013-10-12 17:38 by 好不容易