• Size:
• File type: .rar
• Coins: 1
• Downloads: 0
• Published: 2023-06-18
• Language: Python
• Tags: python crawler

Resource Description

Web Scraping with Python (用Python写网络爬虫) PDF & source code

Code Snippet and File Info
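
The snippet below appears to be chapter01\common.py from the bundled source. It builds a downloader up in five steps: a bare fetch (download1), error handling (download2), retries on 5XX responses (download3), a configurable user agent (download4), and optional proxy support (download5). The code targets Python 2, where urllib2 and urlparse are separate standard-library modules.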

# -*- coding: utf-8 -*-

import urllib2
import urlparse


def download1(url):
    """Simple downloader"""
    return urllib2.urlopen(url).read()


def download2(url):
    """Download function that catches errors"""
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html


def download3(url, num_retries=2):
    """Download function that also retries 5XX errors"""
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download3(url, num_retries - 1)
    return html


def download4(url, user_agent='wswp', num_retries=2):
    """Download function that includes user agent support"""
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download4(url, user_agent, num_retries - 1)
    return html


def download5(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download function with support for proxies"""
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download5(url, user_agent, proxy, num_retries - 1)
    return html


download = download5


if __name__ == '__main__':
    print download('http://example.webscraping.com')
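
The snippet above is Python 2 only. For readers on Python 3, where urllib2 and urlparse were merged into urllib.request, urllib.error, and urllib.parse, a minimal sketch of the final download5 variant might look like the following. This is an illustrative port, not part of the bundled source.

# -*- coding: utf-8 -*-
# Hypothetical Python 3 port of download5 above; a sketch, not the book's code.

import urllib.error
import urllib.parse
import urllib.request


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download url with user-agent and proxy support, retrying 5XX errors"""
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    opener = urllib.request.build_opener()
    if proxy:
        # route requests for this URL's scheme through the given proxy
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry 5XX HTTP errors (HTTPError instances carry a .code)
            html = download(url, user_agent, proxy, num_retries - 1)
    return html


if __name__ == '__main__':
    print(download('http://example.webscraping.com'))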

Attribute       Size  Date        Time   Name
---------  ---------  ----------  -----  ----
File             174  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\.hg_archival.txt
File            2364  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\common.py
File             553  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\iteration_crawler1.py
File             846  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\iteration_crawler2.py
File             931  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\link_crawler1.py
File            1149  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\link_crawler2.py
File            4649  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\link_crawler3.py
File             445  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter01\sitemap_crawler.py
File             554  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\bs_example.py
File             462  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\common.py
File            4816  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\link_crawler.py
File             371  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\lxml_example.py
File            2293  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\performance.py
File             333  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\regex_example.py
File             700  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\scrape_callback1.py
File             940  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter02\scrape_callback2.py
File            3686  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter03\disk_cache.py
File            3230  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter03\downloader.py
File            3183  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter03\link_crawler.py
File            2356  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter03\mongo_cache.py
File             818  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\alexa_cb.py
File             564  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\alexa_fn.py
File            3026  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\mongo_queue.py
File            2736  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\process_crawler.py
File             471  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\process_test.py
File             375  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\sequential_test.py
File            2491  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\threaded_crawler.py
File             475  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter04\threaded_test.py
File            2747  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter05\browser_render.py
File            1101  2015-09-28  13:29  用Python写网络爬虫PDF&源码\用Python写爬虫-源码\chapter05\search1.py
............ (156 additional file entries omitted)
