• 大小: 70KB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2021-05-28
  • 语言: 其他
  • 标签: 点赞数  

资源简介

微博数据爬取 demo,解析微博评论数、点赞数、图片链接等

资源截图

代码片段和文件信息

package top.kittygirl.wechat;


import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

public class cpwsDataCrawler extends BreadthCrawler {

    /**
     * @param crawlPath crawlPath is the path of the directory which maintains
     *                  information of this crawler
     * @param autoParse if autoParse is trueBreadthCrawler will auto extract
     *                  links which match regex rules from pag
     */
    public cpwsDataCrawler(String crawlPath boolean autoParse) {
        super(crawlPath autoParse);
        /*start page*/
        //this.addSeed(“http://news.xidian.edu.cn/“);
       // http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6
        /*fetch url like http://news.hfut.edu.cn/show-xxxxxxhtml*/
        //this.addRegex(“http://news.xidian.edu.cn/info/.*htm“);
        /*do not fetch jpg|png|gif*/
        //this.addRegex(“-.*\\.(jpg|png|gif).*“);
        /*do not fetch url contains #*/
       // this.addRegex(“-.*#.*“);
        this.addSeed(“http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6“);
        setThreads(50);
        getConf().setTopN(1000);
        getConf().setExecuteInterval(100000);

    }


    public void visit(Page page CrawlDatums next) {
        String  a  = page.select(“#list“).select(“#resultList“).select(“#dataItem1“).text();

     //   String url = page.url();
   //     System.out.println(url);
        /*if page is news page*/
/*        if (page.matchUrl(“http://news.xidian.edu.cn/info/.*htm“)) {
            *//*extract title and content of news by css selector*//*
            String title = page.select(“div.neirong-bt“).text();
            String date = page.select(“span#date“).text();
            String clickNum = page.select(“div#wz_info.b_b“).first().child(3).select(“span“).first().child(0).select(“span“).val();
            String content = page.selectText(“div#artibody“);
            System.out.println(“URL:\n“ + url);
            System.out.println(“title:\n“ + title);
            System.out.println(“date:\n“ + date);
            System.out.println(“clickNum:\n“ + clickNum);
            System.out.println(“content:\n“ + content);
        }*/
    }

    public static void main(String[] args) throws Exception {
        cpwsDataCrawler crawler = new cpwsDataCrawler(“crawlllesZ“ true);
        /*start crawl with depth of 4*/
        crawler.start(1);
    }
}

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-03-31 09:14  weChatCrawler-master\
     目录           0  2019-03-31 09:14  weChatCrawler-master\.idea\
     文件         624  2019-03-01 14:40  weChatCrawler-master\.idea\compiler.xml
     文件         138  2019-03-01 14:40  weChatCrawler-master\.idea\encodings.xml
     目录           0  2019-03-31 09:14  weChatCrawler-master\.idea\fileTemplates\
     目录           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\code\
     目录           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\includes\
     目录           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\internal\
     目录           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\j2ee\
     目录           0  2019-03-31 09:14  weChatCrawler-master\.idea\libraries\
     文件         504  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__cglib_cglib_nodep_3_2_4.xml
     文件         642  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__cn_edu_hfut_dmic_webcollector_WebCollector_2_73_alpha.xml
     文件         543  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__commons_codec_commons_codec_1_10.xml
     文件         503  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__commons_io_commons_io_2_5.xml
     文件         558  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__commons_logging_commons_logging_1_2.xml
     文件         514  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_alibaba_fastjson_1_2_41.xml
     文件         564  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_codeborne_phantomjsdriver_1_4_0.xml
     文件         654  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml
     文件         515  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_google_code_gson_gson_2_8_0.xml
     文件         499  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_google_guava_guava_21_0.xml
     文件         480  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_sleepycat_je_5_0_73.xml
     文件         536  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_squareup_okhttp3_okhttp_3_11_0.xml
     文件         510  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_squareup_okio_okio_1_14_0.xml
     文件         578  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__javax_servlet_javax_servlet_api_3_1_0.xml
     文件         455  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__junit_junit_4_12.xml
     文件         469  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__log4j_log4j_1_2_17.xml
     文件         574  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__mysql_mysql_connector_java_5_1_31.xml
     文件         492  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_java_dev_jna_jna_4_1_0.xml
     文件         555  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_java_dev_jna_jna_platform_4_1_0.xml
     文件         498  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_sf_opencsv_opencsv_2_3.xml
     文件         577  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_sourceforge_cssparser_cssparser_0_9_22.xml
............此处省略74个文件信息

评论

共有 条评论

相关资源