资源简介

网络爬虫是一种能够自动采集互联网信息的程序。网络爬虫不但能够作为搜索引擎的采集器,而且可以用于特定信息的采集,即根据某些特定的要求采集网站中的信息,如就业、租房信息等。本文设计并实现了一种基于主题的网络爬虫程序。采用何种搜索策略以及如何评价当前页面的主题相关度,是基于主题的网络爬虫需要解决的关键问题。本文设计的网络爬虫采用广度优先搜索,对URL进行解析、去重等处理,并应用Java多线程,使爬虫在抓取网页的过程中更有效率。评价页面相关度通常采用基于内容评价的搜索策略,本文实现了三个常用的相关度评价算法,分别是基于网页内容的相关度算法、基于网页内容和标题的相关度算法、基于网页内容和链接结构的相关度算法。

资源截图

代码片段和文件信息

package theme;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.*;
import javax.swing.JButton;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JTextPane;
import javax.swing.text.BadLocationException;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.styleConstants;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class Crawler
{
// Title of this crawler; read and written through gettitle()/settitle().
private String title;
// Static, so shared by every Crawler instance. Presumably counts running
// worker threads -- TODO confirm against the code that updates it.
private volatile static int threadNum = 0;
// Defaults to 1000; adjustable through setUrlCount(). Presumably the cap on
// the number of URLs to crawl -- TODO confirm.
private int urlCount = 1000;
// Presumably the number of URLs processed so far -- TODO confirm.
// NOTE(review): volatile only guarantees visibility; if this is incremented
// from several threads the update is not atomic.
private volatile int visitedURL = 0;
// Defaults to 5; adjustable through setThreadCount().
private int threadCount = 5;
// Defaults to 0.7; adjustable through setThreshold(). Presumably the minimum
// relevance score for a page to be accepted -- TODO confirm.
private double threshold = 0.7;
// Start URL; assigned in the constructor and through setStartURL().
private String startURL;

// Keyword table filled by addKeyWord(): key is the word, value is its count.
// Raw type, so element types are not checked by the compiler.
private HashMap keywords = new HashMap();

// Presumably the queue of URLs still waiting to be processed, ordered by
// priority -- TODO confirm element type (PriorityURL?).
private PriorityBlockingQueue waitforHandling = new PriorityBlockingQueue();

// Presumably the set of URLs already visited -- TODO confirm.
private HashSet visited = new HashSet();

// Presumably the pages judged relevant -- TODO confirm key/value meaning.
private HashMap wanted = new HashMap();

// Presumably the pages judged not relevant -- TODO confirm.
private HashSet noneRelevant = new HashSet();

// Starts out false. Presumably a stop-request flag -- TODO confirm.
// NOTE(review): not volatile, unlike the counters above.
private boolean stop = false;
 

// Swing components assigned in the constructor.
private JTextPane textpane;
private JLabel label;
private JButton button;

// Package-private executor created with Executors.newCachedThreadPool().
ExecutorService threadPool = Executors.newCachedThreadPool();  

public String gettitle()
{
return title;
}

public void settitle(String title)
{
this.title = title;
}

public int getUrlCount()
{
return urlCount;
}

public void setUrlCount(int urlCount)
{
this.urlCount = urlCount;
}

public int getThreadCount()
{
return threadCount;
}

public Iterator getKeyWords() 
{
return keywords.keySet().iterator();
}

public void setThreshold(double threshold)
{
this.threshold = threshold;
}

public String getStartURL()
{
return startURL;
}

public void setStartURL(String startURL)
{
this.startURL = startURL;
}

public double getThreshold()
{
return threshold;
}

public void setThreadCount(int threadCount)
{
this.threadCount = threadCount;
}

public void addKeyWord(String word int count) 
{
keywords.put(word count);
}

public void removeKeyWord(String word)
{
if (word != null)
{
if (keywords.containsKey(word))
{
keywords.remove(word);
}
}
}

public void removeAllKeyWords()
{
keywords.clear();
}

public Crawler(String title String start JTextPane textpane JLabel labelJButton button)
{
this.title = title;
this.startURL = start;
this.textpane = textpane;
this.label = label;
this.butt

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2015-05-29 12:51  theme\
     文件         858  2015-04-30 10:18  theme\.classpath
     文件         381  2015-04-20 13:24  theme\.project
     目录           0  2015-04-20 13:24  theme\.settings\
     文件         598  2015-04-20 13:24  theme\.settings\org.eclipse.jdt.core.prefs
     目录           0  2015-05-26 22:43  theme\bin\
     目录           0  2015-05-26 22:43  theme\bin\theme\
     文件         738  2015-05-26 22:43  theme\bin\theme\Crawler$1.class
     文件         676  2015-05-26 22:43  theme\bin\theme\Crawler$2.class
     文件         906  2015-05-26 22:43  theme\bin\theme\Crawler$3.class
     文件         619  2015-05-26 22:43  theme\bin\theme\Crawler$Task.class
     文件       13408  2015-05-26 22:43  theme\bin\theme\Crawler.class
     文件       10776  2015-05-26 22:43  theme\bin\theme\Crawlerframe.class
     文件        8304  2015-05-26 22:43  theme\bin\theme\Download.class
     文件         817  2015-05-26 22:43  theme\bin\theme\HtmlParserTool$1.class
     文件        2703  2015-05-26 22:43  theme\bin\theme\HtmlParserTool.class
     文件         645  2015-05-26 22:43  theme\bin\theme\HttpConstants.class
     文件         148  2015-05-26 22:43  theme\bin\theme\linkFilter.class
     文件         873  2015-05-26 22:43  theme\bin\theme\PriorityURL.class
     文件           0  2015-04-29 15:12  theme\result
     目录           0  2015-04-20 13:24  theme\src\
     目录           0  2015-04-25 18:24  theme\src\theme\
     文件       11856  2015-05-06 12:21  theme\src\theme\Crawler.java
     文件       10473  2015-04-30 10:20  theme\src\theme\Crawlerframe.java
     文件        7113  2015-05-06 12:21  theme\src\theme\Download.java
     文件        1780  2015-05-06 12:21  theme\src\theme\HtmlParserTool.java
     文件         606  2015-04-20 15:27  theme\src\theme\HttpConstants.java
     文件         160  2015-05-03 15:33  theme\src\theme\linkFilter.java
     文件         505  2015-05-26 22:43  theme\src\theme\PriorityURL.java

评论

共有 条评论