• 大小: 5KB
    文件类型: .py
    金币: 2
    下载: 1 次
    发布日期: 2021-06-12
  • 语言: Python
  • 标签: python  

资源简介

师兄写的 Python 爬虫程序。

资源截图

代码片段和文件信息

# -*- coding: cp936 -*-
"""Multi-threaded Baidu search-result spider (Python 2 source)."""

import urllib
import urllib2
import httplib
import threading
import re
import string
import time
import os
import StringIO
import gzip
# Fix: the republished listing had the garbled name "URLErrorHTTPError";
# these are two distinct exception classes and must be comma-separated.
from urllib2 import URLError, HTTPError
from httplib import BadStatusLine

class spider(threading.Thread):
    def __init__(selfthreadnameresultstartpagebasekeywordpathmode=0):
        threading.Thread.__init__(selfname=threadname)
        self.result=result
        self.startpage=startpage
        self.base=base
        self.keyword=keyword
        self.path=path
        self.mode=mode    #爬取的类型:商品 公司
    def run(self):
        starttime=time.clock()
        retry=0     #设置连接次数
        page=self.startpage
        while 1:
            if(page>70):    #公司类
                endtime=time.clock()
                print self.getName()+‘thread finish total time:%d‘%(endtime-starttime)
                #print self.result
                print ‘**********%d‘%len(self.result)
                break
            if(page>10 and self.mode==1):    #商品类
                endtime=time.clock()
                print self.getName()+‘thread finish total time:%d‘%(endtime-starttime)
                #print self.result
                print ‘**********%d‘%len(self.result)
                break
                
            try:
                url=“http://www.baidu.com/s?wd=“+urllib.quote(self.keyword)+“&pn=“+str(page*10)
                #url=“http://www.google.com.hk/search?q=“+urllib.quote(self.keyword)+‘&hl=zh-CN&newwindow=1&safe=strict&biw=1199&bih=654&prmd=ivnscm&ei=HMOCTeWyDo_RcfjR_ZkD&start=‘+str(page*10)
                req=urllib2.Request(url)
                req.add_header(“User-Agent“‘Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.0.5) Gecko/2008121622 Ubuntu/8.10 (intrepid) Firefox/3.0.5‘)
                response=urllib2.urlopen(req)
                html=response.read()
                self.searchResult(htmlself.mode)
                page=page+self.base
                
            except HTTPErrore:
                print “the server can‘t fullfill the request.\n“
                print “Error code:“e.code
                page=page+self.base
                
            except URLErrore:
                if(retry>2):
                    print “can‘t open the %d page url“%page
                    print “URLError:“e.reason
                    page=page+self.base 
                    retry=0
                else:
                    print “retry connnect to:%d page“%page
       

评论

共有 条评论