• 大小: 5KB
    文件类型: .py
    金币: 2
    下载: 1 次
    发布日期: 2021-06-12
  • 语言: Python
  • 标签: python  

资源简介

师兄写的 Python 爬虫程序。

资源截图

代码片段和文件信息

# -*- coding: cp936 -*-
"""Multi-threaded Baidu search-result spider (Python 2 source)."""

import urllib
import urllib2
import httplib
import threading
import re
import string
import time
import os
import StringIO
import gzip
# Fix: the republished listing had the garbled name "URLErrorHTTPError";
# these are two distinct exception classes and must be comma-separated.
from urllib2 import URLError, HTTPError
from httplib import BadStatusLine

class spider(threading.Thread):
    def __init__(selfthreadnameresultstartpagebasekeywordpathmode=0):
        threading.Thread.__init__(selfname=threadname)
        self.result=result
        self.startpage=startpage
        self.base=base
        self.keyword=keyword
        self.path=path
        self.mode=mode    #爬取的类型:商品 公司
    def run(self):
        starttime=time.clock()
        retry=0     #设置连接次数
        page=self.startpage
        while 1:
            if(page>70):    #公司类
                endtime=time.clock()
                print self.getName()+‘thread finish total time:%d‘%(endtime-starttime)
                #print self.result
                print ‘**********%d‘%len(self.result)
                break
            if(page>10 and self.mode==1):    #商品类
                endtime=time.clock()
                print self.getName()+‘thread finish total time:%d‘%(endtime-starttime)
                #print self.result
                print ‘**********%d‘%len(self.result)
                break
                
            try:
                url=“http://www.baidu.com/s?wd=“+urllib.quote(self.keyword)+“&pn=“+str(page*10)
                #url=“http://www.google.com.hk/search?q=“+urllib.quote(self.keyword)+‘&hl=zh-CN&newwindow=1&safe=strict&biw=1199&bih=654&prmd=ivnscm&ei=HMOCTeWyDo_RcfjR_ZkD&start=‘+str(page*10)
                req=urllib2.Request(url)
                req.add_header(“User-Agent“‘Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.0.5) Gecko/2008121622 Ubuntu/8.10 (intrepid) Firefox/3.0.5‘)
                response=urllib2.urlopen(req)
                html=response.read()
                self.searchResult(htmlself.mode)
                page=page+self.base
                
            except HTTPErrore:
                print “the server can‘t fullfill the request.\n“
                print “Error code:“e.code
                page=page+self.base
                
            except URLErrore:
                if(retry>2):
                    print “can‘t open the %d page url“%page
                    print “URLError:“e.reason
                    page=page+self.base 
                    retry=0
                else:
                    print “retry connnect to:%d page“%page
       

评论

共有 条评论