• 大小: 5.28MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-10-04
  • 语言: Python
  • 标签:

资源简介

网站图片爬虫(已包含:微博、微信公众号、花瓣网)、免费 IP 代理及豆瓣电影爬虫

资源截图

代码片段和文件信息

#encoding:utf-8

# Standard library.
# NOTE(review): the extracted text read "import ossystime"; the separating
# commas were lost, and `sys` is needed by the encoding setup below.
import json
import os
import re
import sys
import time

# Third-party packages used by the crawler.
import requests
from bs4 import BeautifulSoup
from lxml import etree
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

# Python 2 only: `reload(sys)` brings back `sys.setdefaultencoding`, which is
# removed from the module at interpreter start-up. Setting it to UTF-8 makes
# implicit str/unicode conversions of the Chinese text in this file use UTF-8
# instead of ASCII.
reload(sys)
sys.setdefaultencoding("utf-8")

# Field extractors: each pattern matches a Chinese field label followed by
# ": " and captures (non-greedily) everything up to the next line break.
# COUNTRIES_RE and LANGUAGES_RE are searched against the serialized
# `div#info` markup in get_item_detail.
#
# The `u` prefix (instead of Python 2's `ur`) keeps the literals valid on both
# Python 2 and Python 3; the patterns contain no backslash other than "\n".
#
# NOTE(review): in the extracted text each literal was split over two lines,
# which is reproduced here as "\n". Because the searched string is HTML, that
# line break may originally have been markup (e.g. a tag after the label and a
# line-break tag at the end) lost in extraction -- confirm against the
# original movie_crawler.py.
LANGUAGES_RE = re.compile(u"语言: (.+?)\n")
COUNTRIES_RE = re.compile(u"制片国家/地区: (.+?)\n")
ALTERNATE_NAME_RE = re.compile(u"又名: (.+?)\n")
RELEASE_TIME_RE = re.compile(u"上映日期: (.+?)\n")
# Captures the first run of one or more decimal digits.
NUM_RE = re.compile(r"(\d+)")

# Name of the text file the crawl results are meant to be saved to.
data_save_file = "douban_donghua_results.txt"

# HTTP request headers sent with every request to movie.douban.com.
# The list-valued headers (Accept-Encoding, Accept-Language) and the
# "KHTML, like Gecko" token use comma separators; the extracted text had lost
# every comma, both between the dict entries and inside these values.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/explore',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'X-Requested-With': 'xmlHttpRequest',
}

def get_item_list(d_url, d_type, d_tag, d_sort, d_page_limit, d_page_start):
    """Fetch one page of items and return the ``subjects`` array of the reply.

    Sends a GET request to ``d_url`` with the module-level ``headers`` and a
    10-second timeout, then decodes the response body as JSON.

    Args:
        d_url: Endpoint to query.
        d_type: Value sent as the ``type`` query parameter.
        d_tag: Value sent as the ``tag`` query parameter.
        d_sort: Value sent as the ``sort`` query parameter; omitted from the
            request when it is the empty string.
        d_page_limit: Value sent as the ``page_limit`` query parameter.
        d_page_start: Value sent as the ``page_start`` query parameter.

    Returns:
        The value stored under ``"subjects"`` in the decoded JSON object.

    Raises:
        KeyError: If the decoded JSON has no ``"subjects"`` key.
    """
    params = {
        "type": d_type,
        "tag": d_tag,
    }
    # NOTE(review): indentation was lost in the extracted text; only the
    # "sort" assignment is assumed to be conditional, since the paging
    # parameters do not depend on the sort order -- confirm against the
    # original file.
    if d_sort != "":
        params["sort"] = d_sort
    params["page_limit"] = d_page_limit
    params["page_start"] = d_page_start

    response = requests.get(d_url, headers=headers, params=params, timeout=10)
    json_obj = response.json()
    return json_obj["subjects"]

def get_item_list_from_newsearch(d_url, d_sort, d_range, d_tag, d_page_start):
    """Fetch one page of search results and return the ``data`` array.

    Sends a GET request to ``d_url`` with the module-level ``headers`` and a
    10-second timeout, then decodes the response body as JSON.

    Args:
        d_url: Endpoint to query.
        d_sort: Value sent as the ``sort`` query parameter.
        d_range: Value sent as the ``range`` query parameter.
        d_tag: Value sent as the ``tags`` query parameter.
        d_page_start: Value sent as the ``start`` query parameter.

    Returns:
        The value stored under ``"data"`` in the decoded JSON object.

    Raises:
        KeyError: If the decoded JSON has no ``"data"`` key.
    """
    params = {
        "sort": d_sort,
        "tags": d_tag,
        "range": d_range,
        "start": d_page_start,
    }
    response = requests.get(d_url, headers=headers, params=params, timeout=10)
    json_obj = response.json()
    return json_obj["data"]

def get_item_detail(item_detail_url):
result_obj = {}
result_obj[“subject_id“] = int(item_detail_url.split(“/“)[-2])
celebrities_url = “https://movie.douban.com/subject/“+str(result_obj[“subject_id“])+“/celebrities“
(directors_cn_namesdirectors_en_namesactors_cn_namesactors_en_names)=get_directors_and_actors(celebrities_url)
result_obj[“directors_cn_names“] = directors_cn_names
result_obj[“directors_en_names“] = directors_en_names
result_obj[“actors_cn_names“] = actors_cn_names
result_obj[“actors_en_names“] = actors_en_names
response = requests.get(item_detail_urlheaders = headerstimeout = 10)
selector = etree.HTML(response.text)
s_response = HtmlResponse(url=item_detail_urlbody = response.textencoding=‘utf-8‘)

name = s_response.selector.xpath(“//title/text()“).extract()
if name: result_obj[“movie_name“] = name[0].replace(u“ (豆瓣)“ ““).strip()

genres = s_response.selector.xpath(“//span[@property=‘v:genre‘]/text()“).extract()
if genres: result_obj[“genres“] = genres

S = ““.join(s_response.selector.xpath(“//div[@id=‘info‘]“).extract())

M = COUNTRIES_RE.search(S)
if M is not None:
result_obj[“countries“] = [country.strip() for country in M.group(1).split(“/“)]

L = LANGUAGES_RE.search(S)
if L is not None:
result_obj[“languages“] = [ lang.st

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2017-10-09 07:55  image_crawler-master\
     目录           0  2017-10-09 07:55  image_crawler-master\DoubanMovie\
     文件        6307  2017-10-09 07:55  image_crawler-master\DoubanMovie\movie_crawler.py
     文件        1850  2017-10-09 07:55  image_crawler-master\DoubanMovie\write_to_mysql.py
     目录           0  2017-10-09 07:55  image_crawler-master\Huaban\
     文件        4257  2017-10-09 07:55  image_crawler-master\Huaban\explain.md
     文件        4719  2017-10-09 07:55  image_crawler-master\Huaban\huaban_crawler.py
     文件     1437852  2017-10-09 07:55  image_crawler-master\Huaban\huaban_travel_places_result.txt
     目录           0  2017-10-09 07:55  image_crawler-master\IpProxy\
     目录           0  2017-10-09 07:55  image_crawler-master\IpProxy\Ip181FreeProxy\
     文件        1086  2017-10-09 07:55  image_crawler-master\IpProxy\Ip181FreeProxy\get_ip181.py
     目录           0  2017-10-09 07:55  image_crawler-master\IpProxy\KuaiFreeProxy\
     文件        1088  2017-10-09 07:55  image_crawler-master\IpProxy\KuaiFreeProxy\get_kuaifreeproxy.py
     目录           0  2017-10-09 07:55  image_crawler-master\IpProxy\XunFreeProxy\
     文件        1155  2017-10-09 07:55  image_crawler-master\IpProxy\XunFreeProxy\get_xunfreeproxy.py
     文件         714  2017-10-09 07:55  image_crawler-master\README.md
     目录           0  2017-10-09 07:55  image_crawler-master\SinaWeibo\
     文件    10883820  2017-10-09 07:55  image_crawler-master\SinaWeibo\chromedriver
     文件       30151  2017-10-09 07:55  image_crawler-master\SinaWeibo\image_result.md
     文件        8873  2017-10-09 07:55  image_crawler-master\SinaWeibo\weibo_crawler.py
     文件        5080  2017-10-09 07:55  image_crawler-master\SinaWeibo\weibo_hot_topic_crawler.py
     目录           0  2017-10-09 07:55  image_crawler-master\WechatOfficialAccounts\
     文件        2333  2017-10-09 07:55  image_crawler-master\WechatOfficialAccounts\spider_wechat_official_accounts.py

评论

共有 条评论