• 大小: 490KB
    文件类型: .rar
    金币: 1
    下载: 0 次
    发布日期: 2021-08-27
  • 语言: 其他
  • 标签: Python  Scrapy  词云  

资源简介

个人自主研制爬虫策略,成功绕过阿里云反爬机制,天猫、淘宝都不在话下!外增词云图绘制代码,带你爬虫、带你数据分析、带你飞!

资源截图

代码片段和文件信息

# -*- coding: utf-8 -*-

import urllib.request
import json
import time
import re


def find_message(url, x, j):
    """Fetch one page of JSONP review data and append each review to comment.csv.

    Args:
        url: Address of the review endpoint. The response is decoded as GBK
            and is expected to be JSONP of the form ``callback({...})``.
        x: Running count of pages that could not be processed so far.
        j: Page number, used only in the success message.

    Returns:
        ``x`` unchanged when every review on the page was handed to ``write``,
        otherwise ``x + 1``.

    Raises:
        Errors from the download, the GBK decode, the JSONP unwrapping and
        ``json.loads`` are not caught here; they propagate to the caller.
    """
    print('已有' + str(x) + '页无法获取')
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('gbk')
    # Strip the JSONP wrapper: keep what lies between the first "(" and the
    # last ")" of the payload.
    jsondata = re.search(r'^[^(]*?\((.*)\)[^)]*$', html).group(1)
    data = json.loads(jsondata)
    try:
        for review in data['rateDetail']['rateList']:
            # Presumably reviews without a follow-up carry a null/empty
            # 'appendComment' (TODO confirm against a live response). Treat
            # that as "no follow-up" rather than failing the whole page.
            append = review.get('appendComment') or {}
            write('comment.csv', (
                review['displayUserNick'],
                review['rateContent'],
                review['rateDate'],
                review['auctionSku'],
                append.get('days', ''),
                append.get('commentTime', ''),
                append.get('content', ''),
            ))
        # NOTE(review): the loop under ``__main__`` passes a 1-based page
        # number, so this reports one more than the page that was requested.
        # Confirm which numbering is intended before changing it.
        print('第' + str(j + 1) + '页数据已经获取成功')
        return x
    except Exception:
        # A malformed page is counted and reported, not fatal. ``Exception``
        # (not ``BaseException``) keeps Ctrl-C and SystemExit working.
        x += 1
        print('已有' + str(x) + '页无法获取')
        print('======此页无法获取======')
        return x


def write(path, text):
    """Append one record to the file at *path*, followed by a newline.

    Args:
        path: File to append to; created if missing, written as UTF-8.
        text: Either a ready-made line (``str``), which is written verbatim,
            or an iterable of field values, which is written as a single
            comma-separated CSV row.
    """
    import csv  # local import: nothing else in this block needs it

    with open(path, 'a', encoding='utf-8') as f:
        if isinstance(text, str):
            f.write(text)
            f.write('\n')
        else:
            # ``writelines`` would glue the fields together with nothing
            # between them and raise TypeError on non-string values. The csv
            # writer separates the columns, quotes embedded commas/quotes and
            # stringifies numbers (None becomes an empty field).
            csv.writer(f, lineterminator='\n').writerow(text)


if __name__ == '__main__':
    # Walk review pages 1..50 of one Tmall item, pausing between requests.
    # x counts the pages that could not be fetched or parsed.
    x = 0
    for j in range(1, 51):
        try:
            print('正在获取第{}页评论数据'.format(j))
            # The page number is substituted into the "currentPage" query
            # parameter (the only "{}" placeholder in the concatenated URL).
            url = (
                'https://rate.tmall.com/list_detail_rate.htm?itemId=537259015354&spuId=694941313&sellerId=2386968451'
                '&order=1&currentPage={}&append=0&content=1&tagId=&posi=&picture=&groupId=&'
                'ua=098%23E1hv0pvxvchvUvCkvvvvvjiPRFsOljlnP2F9tjYHPmP9ljnHnLqv6j3URFLhAj1U9phvHHiaLxF3zHi4w17gtssR'
                '7TC4NrGBdphv219vhQ9wjVoKzYVtRkHL6OhCvv14cGJOEa1475PE7r%2FCvpvW7D%2FShUbw7Dis%2BtjN9phv2HiNsQ9bzHi4'
                'wTo2zsyCvvpvvvvvkphvC9QvvOC0p4yCvv9vvUmljyONNbyCvmFMMQ2GS6vvtQvvvQCvpvoKvvv2vhCv2UhvvvWvphvWgvvvvQ'
                'avpvQXmphvLv3fYpvjcRCldU9tK7ERiNLyzCyXfCuYiXVvVE6Fp%2B0x9W9OjLEc6acEKBm6NB3rQjcQ%2BulgEfk1DfesRk9c'
                'znsW1C0OwZFvgb2XrqpCvpvVvmvvvhCv2QhvCvvvMMGtvpvhvvvvv8wCvvpvvUmm3QhvCvvhvvmCvpvW7D%2FjM0Lw7Di4XLLN'
                'dphvmpvhYUWOVvCpjOhCvCB47Twpc1147DiAiKNG%2FHrz7IbNVLyCvvpvvvvvdphvmpvZL9nEop2nULyCvvpvvvvv'
                '&needFold=0&_ksTS=1560657691879_1614&callback=jsonp1615'
            ).format(j)
            x = find_message(url, x, j)
            time.sleep(3)
        except Exception as exc:
            # The download or JSONP parsing failed before find_message could
            # count the page itself, so record it here and move on.
            # Catching Exception (not BaseException) leaves Ctrl-C working.
            x += 1
            print('======此页无法获取======', repr(exc))
            continue

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件        298  2019-06-24 16:11  tmall\.idea\misc.xml

     文件        269  2019-06-16 11:55  tmall\.idea\modules.xml

     文件        497  2019-06-17 16:11  tmall\.idea\tmall.iml

     文件      37254  2019-06-25 12:19  tmall\.idea\workspace.xml

     文件     319401  2019-06-24 18:52  tmall\bucket.jpg

     文件      28657  2019-06-24 21:43  tmall\capstone.html

     文件       2767  2019-06-17 15:52  tmall\comment_selenium.py

     文件      79800  2019-06-22 00:51  tmall\kuaidaili.csv

     文件        253  2019-06-16 11:50  tmall\scrapy.cfg

     文件       4329  2019-06-19 23:19  tmall\selenium_tmall.py

     文件     408915  2019-06-23 11:26  tmall\TeeMall.csv

     文件         59  2019-06-24 18:46  tmall\test.py

     文件          0  2019-06-20 15:01  tmall\text.json

     文件        661  2019-06-20 03:43  tmall\tmall\items.py

     文件      79800  2019-06-22 00:51  tmall\tmall\kuaidaili.csv

     文件       4512  2019-06-23 00:23  tmall\tmall\middlewares.py

     文件        286  2019-06-16 11:50  tmall\tmall\pipelines.py

     文件       3149  2019-06-23 11:23  tmall\tmall\settings.py

     文件      30392  2019-06-18 14:58  tmall\tmall\spiders\geckodriver.log

     文件       9273  2019-06-23 00:29  tmall\tmall\spiders\jisu.py

     文件        161  2019-03-07 10:26  tmall\tmall\spiders\__init__.py

     文件       4079  2019-06-23 00:29  tmall\tmall\spiders\__pycache__\jisu.cpython-36.pyc

     文件        127  2019-06-16 11:52  tmall\tmall\spiders\__pycache__\__init__.cpython-36.pyc

     文件      30427  2019-06-20 03:15  tmall\tmall\test.py

     文件       5092  2019-06-17 01:01  tmall\tmall\utils.py

     文件          0  2019-03-07 10:26  tmall\tmall\__init__.py

     文件        605  2019-06-20 03:45  tmall\tmall\__pycache__\items.cpython-36.pyc

     文件       3818  2019-06-23 00:26  tmall\tmall\__pycache__\middlewares.cpython-36.pyc

     文件        446  2019-06-17 11:14  tmall\tmall\__pycache__\pipelines.cpython-36.pyc

     文件        658  2019-06-23 11:23  tmall\tmall\__pycache__\settings.cpython-36.pyc

............此处省略16个文件信息

评论

共有 条评论