资源简介

利用 Python 编写的爬虫代码,用于爬取房天下的商品房信息;更改链接地址后,也可以爬取其他信息。

资源截图

代码片段和文件信息

#!/usr/bin/python
#-*-coding:utf-8-*-
#coding:gbk
from lxml import etree
import requests
import re
import numpy as np
import json
import sys
# Python 2 only: site.py deletes sys.setdefaultencoding at startup, so the
# module has to be reloaded to get it back. UTF-8 is then used for implicit
# str/unicode conversions. Python 3 already defaults to UTF-8 and has no
# builtin reload(), so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
def fangtianxia(url):
    """Scrape one Fang.com new-home listing page and print each listing's link.

    The page at ``url`` is downloaded with a desktop-browser User-Agent,
    decoded as GBK and parsed with lxml.  Every ``<li>`` under the first
    ``<ul>`` inside ``div[@class="nl_con clearfix"]`` is treated as one
    listing, from which the detail-page link, project name, region, address,
    price and phone number are extracted.  Region, address, price and phone
    fall back to ``np.nan`` when their node is missing.

    ``fangtianxia.txt`` is opened in append mode for the duration of the
    loop (and is now always closed).

    NOTE(review): the listing this function was recovered from is truncated
    right after the per-listing ``print``; whatever it wrote to the file or
    returned is not visible and has not been reconstructed.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        IndexError: If the listing container is not found on the page, or a
            listing has no detail link / project name.
        UnicodeDecodeError: If the response body is not valid GBK.
        requests.RequestException: On network failure.
    """
    head = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        )
    }
    html = requests.get(url, headers=head).content.decode('gbk')
    selector = etree.HTML(html)
    content_field = selector.xpath('//div[@class="nl_con clearfix"]/ul')[0]

    # The original rebound ``url`` to a list here, shadowing the parameter;
    # a distinct name keeps the argument readable.  ``url_lp`` / ``url_hx``
    # are not used in the visible part of the listing.
    urls, url_lp, url_hx = [], [], []

    # ``with`` guarantees the handle is closed even if a listing raises.
    with open('fangtianxia.txt', 'a') as out_file:  # noqa: F841
        for each in content_field.xpath('li'):
            website = each.xpath('div[1]/div[2]/div[1]/div[1]/a')[0].xpath('@href')[0]
            urls.append(website)
            loupan = each.xpath('div[1]/div[2]/div[1]/div[1]/a/text()')[0].strip()

            # Optional fields: a missing node makes ``[0]`` raise, in which
            # case the value is recorded as NaN (best effort, as before).
            try:
                region = (
                    each.xpath('div[1]/div[2]/div[3]/div[1]/a/span/text()')[0]
                    .replace("]", "")
                    .replace("[", "")
                    .strip()
                )
            except Exception as e:
                print(e)
                region = np.nan

            try:
                address = each.xpath('div[1]/div[2]/div[3]/div[1]/a')[0].xpath('@title')[0]
            except Exception:
                address = np.nan

            try:
                price = (
                    each.xpath('div[1]/div[2]/div[5]/span/text()')[0]
                    + each.xpath('div[1]/div[2]/div[5]/em/text()')[0]
                )
            except Exception as e:
                print(e)
                price = np.nan

            try:
                phone = (
                    each.xpath('div[1]/div[2]/div[3]/div[2]/p/text()[1]')[0]
                    + "转"
                    + each.xpath('div[1]/div[2]/div[3]/div[2]/p/text()[2]')[0]
                )
            except Exception as e:
                print(e)
                phone = np.nan

            # NOTE(review): the source is cut off mid-statement here
            # ("print we..."); printing the detail link is the most
            # conservative completion - confirm against the full original.
            print(website)

评论

共有 条评论