• 大小: 5KB
• 文件类型: .py
• 金币: 1
• 下载: 0 次
• 发布日期: 2022-09-06
• 语言: Python
• 标签: python

资源简介

使用 Python 实现股吧评论的抓取及分析。

资源截图

代码片段和文件信息

import codecs
import random
import re
import time

import jieba
import jieba.analyse
import requests
import tushare
from lxml import html

# Optional proxy mapping passed to every request; None means a direct
# connection.  Example: proxies = {"http": "123.53.86.133:61234"}
proxies = None

# Request headers sent with every page fetch.
# NOTE(review): the comma in "KHTML, like Gecko" was missing from the scraped
# copy of this file (every comma had been stripped); it is restored here.
headers = {
    'Host': 'guba.eastmoney.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X metaSr 1.0',
}

def get_url(stocknum, page):
    """Collect the post links listed on one page of a stock's board.

    Args:
        stocknum: Stock code identifying the board; converted with ``str``.
        page: Page number of the post list; converted with ``str``.

    Returns:
        The list of ``href`` values matched by the XPath query (possibly
        empty), or the empty string ``''`` if the download or the parsing
        raised an exception.
    """
    # NOTE(review): the scraped copy of this file had lost every comma.  The
    # one after "list" is restored on the assumption that board pages live at
    # "list,<code>_<page>.html" -- confirm against the live site.
    url = 'http://guba.eastmoney.com/list,' + str(stocknum) + '_' + str(page) + '.html'
    try:
        # A short-lived session gives the retry setting a real effect (it is
        # attached to the adapter that performs the request) and closes the
        # connection afterwards instead of keeping it alive.
        with requests.Session() as session:
            session.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
            response = session.get(url, headers=headers, proxies=proxies, timeout=20)
        tree = html.fromstring(response.text)
        urls = tree.xpath('//div[@id="articlelistnew"]//div[@class="articleh normal_post"]/span[3]/a/@href')
    except Exception:
        # Best effort: back off for 1-4 seconds and report "no links".
        time.sleep(random.random() + random.randint(1, 3))
        urls = ''
    return urls


def get_comments(urls):
    """Download each post page and pass its dated comments to ``save_to_file``.

    For every relative link in ``urls`` the page is fetched and parsed, the
    timestamps and comment texts are extracted with XPath, paired up in page
    order, and handed to ``save_to_file`` as a ``{date: comment}`` dict.
    Pages that yield no comment text are skipped.  Any failure while handling
    one page is reported with a short message followed by a random pause, and
    processing continues with the next link.

    Args:
        urls: Iterable of relative post links (for example the result of
            ``get_url``).  An empty iterable, including ``''``, is a no-op.

    Returns:
        None.
    """
    # Compiled once instead of once per timestamp.
    noise = re.compile('发表于| ')
    for newurl in urls:
        page_url = 'http://guba.eastmoney.com' + newurl
        try:
            # A short-lived session gives the retry setting a real effect and
            # closes the connection afterwards instead of keeping it alive.
            with requests.Session() as session:
                session.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
                response = session.get(page_url, headers=headers, proxies=proxies, timeout=20)
            tree = html.fromstring(response.text)

            raw_times = tree.xpath('//div[@class="zwfbtime"]/text()|//div[@class="zwli clearfix"]/div[4]/div/div[2]/text()')
            # Keep the first 10 characters left after removing the label and
            # the spaces.  A page without timestamps yields [''] as before.
            times = [noise.sub('', x)[:10] for x in raw_times] or ['']

            raw_comments = tree.xpath('//div[@class="stockcodec .xeditor"]/text()|//div[@class="zwli clearfix"]/div[4]/div/div[3]/div/text()')
            # Built directly as a list: joining on '!' and splitting again
            # would cut any comment that contains '!' into several entries
            # and shift every later comment onto the wrong timestamp.
            comments = [w.strip() for w in raw_comments]
            if not comments or comments == ['']:
                continue

            # NOTE(review): keys are the truncated timestamps, so several
            # comments sharing a key overwrite each other and only the last
            # one is saved.  Left unchanged because save_to_file's expected
            # input is not visible here.
            save_to_file(dict(zip(times, comments)))
        except Exception:
            # ``except Exception`` rather than a bare ``except`` so that
            # Ctrl-C (KeyboardInterrupt) still stops the crawl.
            print('error!!!!')
            time.sleep(random.random() + random.randint(0, 3))

    # if times and comments:
        # dic.append({‘time‘:times‘comment‘:comments})
    # re

评论

共有 条评论