资源简介

python 抓取1688店铺产品详情,爬虫

资源截图

代码片段和文件信息

#coding=utf8  
import urllib2  
import re  
import MySQLdb  
import requests
from compiler.pycodegen import EXCEPT
import random
import time
import datetime
from lxml import etree
import sys
# Python 2 idiom: re-load ``sys`` so that ``setdefaultencoding`` is callable,
# then make utf8 the default codec for implicit str <-> unicode conversions
# (the script prints and stores Chinese text).
reload(sys)
sys.setdefaultencoding('utf8')

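# Setup before running: create a database named 1688database, create the
# urltable table in it, and use the SQLyog tool to load every shop URL into urltable.
# get_url_list() below copies the shop URLs from urltable into the url_list list.
# URL format: https://shop1470760677060.1688.com/page/offerlist.htm?spm=a261y.7663282.0.0.Su0VBS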
def get_url_list():
    """Return every shop URL stored in the ``urltable`` table.

    Runs ``SELECT url FROM urltable;`` on the module-level cursor ``cur``
    (defined elsewhere in this file), prints the value ``execute`` returned
    as the number of shop addresses, then echoes each URL while collecting
    the first column of every fetched row.

    Returns:
        list: the ``url`` column values, in the order the cursor returned
        them; empty when the query yields no rows.
    """
    get_url_sql = 'SELECT url FROM urltable;'
    count = cur.execute(get_url_sql)
    # Single-argument parenthesised print produces the same output as the
    # Python 2 print statement used by the rest of this script.
    print(u' 有  %s 个店铺地址  ' % count)

    url_list = []
    for row in cur.fetchall():
        # Each row is a one-column sequence; the URL is its first element.
        shop_url = row[0]
        print(shop_url)
        url_list.append(shop_url)
    return url_list

def get_all_goods_url(page):
    begin = datetime.datetime.now()
    
    # 如果出现异常,尝试次数5次,还是错误,则判断,页码超出范围,停止采集。
    page = page
    print u‘.................第%s页...........‘ %page
    count = 0
    conut_net = 0
   
    DD = True
    while DD:
        print ‘conut_net-->‘conut_net
        try:
            proxyHost = “proxy.abuyun.com“
            proxyPort = “9020“
            proxyUser = “H4073W6H9EJ29Z4D“
            proxyPass = “32D3D1294745B2B2“
            proxymeta = “http://%(user)s:%(pass)s@%(host)s:%(port)s“ % {
                  “host“ : proxyHost
                  “port“ : proxyPort
                  “user“ : proxyUser
                  “pass“ : proxyPass
            }
            proxies = {
                    “http“  : proxymeta
                    “https“ : proxymeta
            }
            headers = {
                #‘:authority‘:‘bertoys.1688.com‘
                #‘:method‘:‘GET‘
                #‘:path‘:‘/page/offerlist.htm?spm=a2615.7691456.0.0.0MNLge&tradenumFilter=false&sampleFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=tradenumdown&pageNum=6‘
                #‘:scheme‘:‘https‘
                ‘accept‘:‘text/htmlapplication/xhtml+xmlapplication/xml;q=0.9image/webp*/*;q=0.8‘
                ‘accept-encoding‘:‘gzip deflate sdch br‘
                ‘accept-language‘:‘zh-CNzh;q=0.8‘
                ‘referer‘:url
                ‘upgrade-insecure-requests‘:‘1‘
                ‘user-agent‘:useragent
                            }  
                
            shop_url = url.split(‘?‘)[0]
            spm = url.split(‘?‘)[1]
                  
            parameter = {
                    ‘spm‘:spm
                    ‘tradenumFilter‘:‘false‘
                    ‘sampleFilter‘:‘false‘
                    ‘mixFilter‘:‘false‘
                    ‘privateFilter‘:‘false‘
                    ‘mobileOfferFilter‘:‘$mobileOfferFilter‘
                    ‘groupFilter‘:‘false‘
                    ‘sortType‘:‘tradenumdown‘
                    ‘pageNum‘:str(page)
                }
            
            #测试

评论

共有 条评论