• Size: 11 KB
    File type: .py
    Coins: 1
    Downloads: 0
    Published: 2023-12-20
  • Language: Python
  • Tags:

Resource description

https://github.com/helloMickey/project_previous/tree/master/judicial-data-analysis. Crawls the date, year, and handling court of legal judgment documents, and downloads the corresponding full texts. Different cases can be crawled by simply changing a few parameters in the code.
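
The listing on this page only shows a fragment of the downloader; the extraction of the fields mentioned above (date, year, court) happens in the repository itself. As a rough illustration only, here is a minimal sketch of how those fields could be pulled out of a fetched judgment page with lxml and regular expressions. The function name parse_fields and the regex patterns are assumptions about typical judgment text, not the repository's actual code.

# coding:utf-8
# Illustrative sketch, not taken from the repository: extract date, year and
# court name from the plain text of a judgment page.
import re
from lxml import etree

def parse_fields(html):
    page = etree.HTML(html)
    text = page.xpath('body')[0].xpath('string(.)')               # flatten the page to plain text
    date_match = re.search(u'(\d{4})年\d{1,2}月\d{1,2}日', text)   # e.g. 2016年10月15日
    court_match = re.search(u'[^，。\s]{2,20}人民法院', text)       # e.g. 某某市中级人民法院
    return {
        'date': date_match.group(0) if date_match else None,
        'year': date_match.group(1) if date_match else None,
        'court': court_match.group(0) if court_match else None,
    }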

Code snippet and file information

# coding:utf-8
import socket

socket.setdefaulttimeout(60)
import requests
import urllib2
# import cchardet
import os
import time
from lxml import etree
import threading
import re
import random
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


# filenames = os.listdir('.')
# count = 0
# for fname in filenames:
#     if fname.startswith('gid_log'):
#         count += 1

# gid_path = 'gid_log_%d' % (count)

# When running steps 1 and 2 separately, pay attention to gid_path
# gid_path = 'gid_log_12'

def get_html(url):  # fetch the page source
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Host": "www.pkulaw.cn",
        "Cookie": "bdyh_record=1970324860086081%2C1970324860087844%2C1970324860087837%2C1970324860087907%2C1970324860085114%2C1970324860087657%2C1970324860087697%2C1970324860087631%2C1970324860087701%2C1970324860087851%2C1970324860086614%2C1970324860000764%2C1970324845231811%2C1970324860004991%2C1970324860002384%2C1970324845231794%2C1970324845231624%2C1970324860002207%2C1970324860046814%2C1970324860046704%2C; CheckIPAuto=0; CheckIPDate=2016-10-15 10:03:46; gm3jc5afyl35gm2yt55kc4m1isIPlogin=1; ASP.NET_SessionId=davttbjhikxhqyn1lj5alhsb; Hm_lvt_58c470ff9657d300e66c7f33590e53a8=1476497011147649834814764985281476499578; Hm_lpvt_58c470ff9657d300e66c7f33590e53a8=1476499578; Hm_lvt_8266968662c086f34b2a3e2ae9014bf8=1476497011147649834814764985281476499578; Hm_lpvt_8266968662c086f34b2a3e2ae9014bf8=1476499578; CookieId=gm3jc5afyl35gm2yt55kc4m1; FWinCookie=1",
        "Upgrade-Insecure-Requests": "1",
        "Proxy-Connection": "keep-alive"
    }
    html = requests.get(url, headers=headers).text
    return html


def write2file(content, filename):  # save the crawled document text to a file
    try:
        f = open(filename, 'w')
    except Exception as e:
        # fall back to a safe filename if the title cannot be used as a file name
        filename = filename.split(u'、')[0] + '_error_filename.txt'
        f = open(filename, 'w')
    f.write(content.encode('utf-8'))
    f.close()

    # download the document corresponding to ihref


def load_one_wenshu(gid, title):
    ex_href = 'http://www.pkulaw.cn/case/FullText/_getFulltext?library=pfnl&gid=#gid#&loginSucc=0'
    href = ex_href.replace('#gid#', gid)
    html = get_html(href)
    page = etree.HTML(html)
    content = page.xpath('body')[0].xpath('string(.)').strip()
    write2file(content, filepath + os.sep + title + '.txt')  # filepath (output directory) is defined elsewhere in the full script


def load_one_page_wenshu(gid_list, titles):  # fetch the documents for multiple hrefs
    # threads = []   # tried multithreading for speed; failed: frequent requests triggered captchas and got the IP banned
    # for i in range(len(gid_list)):
    #     gid, title = gid_list[i], titles[i]
    #     threads.append(threading.Thread(target=load_one_wenshu, args=(gid, title)))
    # for t in threads:
    #     t.start()
    # t.join()  # blocking

    for i in range(len(gid_list)):  # crawl sequentially; slow, roughly 20 to 30 hours for one month of cases
        load_one_wenshu(gid_list[i], titles[i])
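
The commented-out threading code above was abandoned because frequent requests triggered captchas and an IP ban. One simple mitigation for the sequential loop, sketched below as an assumption rather than code from the repository, is to pause a random interval between documents, reusing the time and random modules the script already imports:

    # illustrative throttled variant of the sequential loop (not from the original file)
    for i in range(len(gid_list)):
        load_one_wenshu(gid_list[i], titles[i])
        time.sleep(random.uniform(2, 5))  # pause 2-5 seconds to reduce captcha / IP-ban risk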
