• 大小: 18.23MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-07-04
  • 语言: Python
  • 标签:

资源简介

爬取百度百科中文页面，抽取三元组信息，构建中文知识图谱。

资源截图

代码片段和文件信息

import glob
import os
import pickle
import re
import sys
import threading
from pathlib import Path

from scrapy.selector import Selector

# Module-level setup shared by all worker threads.
#   pages     -- work queue: every path matched under ../webpages/
#   paged     -- paths that have been extracted successfully (persisted
#                to `savepath` so a later run can reload the list)
#   lock      -- guards concurrent access to `pages`, `paged`, `fail_file`
#   fail_file -- log of page paths whose extraction raised an exception
print('loading pages')
pages = glob.glob('../webpages/*')
print('loading pages done.')
savepath = './paged.bin'

print(len(pages))
# Guard the sample print: an empty directory would otherwise raise
# IndexError before any work starts.
if pages:
    print(pages[0])
paged = []
if os.path.exists(savepath):
    # NOTE(review): pickle must only be used on files this script wrote
    # itself; never point `savepath` at untrusted data.
    with open(savepath, 'rb') as state_file:
        paged = pickle.load(state_file)
    print('load state')
lock = threading.Lock()
# Kept open for the lifetime of the script; closed by the driver code.
fail_file = open('./fail_para.txt', 'w')
class MyThread(threading.Thread):
    """Worker thread that drains the shared `pages` queue.

    Each worker repeatedly takes one page path from the module-level
    `pages` list, extracts its paragraph text into ``./info-para/`` and
    records the path in `paged` on success or in `fail_file` on failure.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        # Cooperative stop flag checked once per page by run().
        self._running = True

    def terminate(self):
        """Ask the worker to stop after the page it is currently processing."""
        self._running = False

    def extract(self, page):
        """Extract the paragraph text of one saved HTML page into a text file.

        Args:
            page: path of the HTML file to read.

        The output goes to ``./info-para/<title>.txt`` where ``<title>`` is
        the concatenated ``<h1>`` text with ``/`` removed.
        NOTE(review): the ``./info-para/`` directory is assumed to exist.
        """
        # Use XPath to pull all text out of the <div class="para"> elements.
        with open(page, 'r') as html_file:
            html = html_file.read()
        line = Selector(text=html).xpath('//div[contains(@class, "main-content")]')
        title = line.xpath('//h1//text()').extract()
        # Keep only text nodes longer than one character, then strip
        # citation markers such as "[12]".
        para = re.sub(
            r'\[[0-9]+\]',
            '',
            ''.join(
                word
                for word in line.xpath('//div[contains(@class, "para")]//text()').extract()
                if len(word) > 1
            ),
        )
        print('process file:' + str(title))
        out_path = './info-para/' + ''.join(title).replace('/', '') + '.txt'
        with open(out_path, 'w') as output:
            output.write(para)

    def run(self):
        """Process pages until the queue is empty or terminate() is called."""
        while self._running:
            # Check-and-take must happen under one lock acquisition,
            # otherwise another worker can empty the queue in between.
            with lock:
                if not pages:
                    break
                page = pages.pop(0)
            try:
                self.extract(page)
            except Exception as e:
                # One bad page must not stop the worker: log it and go on.
                print('fail to extract..', str(e))
                with lock:
                    fail_file.write(page + '\n')
            else:
                with lock:
                    paged.append(page)



# Driver: run 12 workers over the shared queue, then persist the list of
# processed pages and close the failure log no matter how the run ended.
list_thread = []
try:
    print('start...')
    for i in range(12):
        list_thread.append(MyThread())
    for th in list_thread:
        th.start()
    # Join only after every worker has been started; joining inside the
    # start loop would run the workers one after another.
    for th in list_thread:
        th.join()
except (Exception, KeyboardInterrupt) as exc:
    for th in list_thread:
        th.terminate()
    # Wait for started workers to finish their current page so that
    # `paged` and `fail_file` are no longer in use when they are
    # saved/closed below.
    for th in list_thread:
        if th.is_alive():
            th.join()
    print('error!', type(exc))
finally:
    print('save state')
    with open('paged.bin', 'wb') as state_file:
        pickle.dump(paged, state_file)
    fail_file.close()



 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-06-10 02:25  WEB_KG-master\
     文件          40  2019-06-10 02:25  WEB_KG-master\.gitignore
     文件        1177  2019-06-10 02:25  WEB_KG-master\README.md
     目录           0  2019-06-10 02:25  WEB_KG-master\ie\
     文件        1788  2019-06-10 02:25  WEB_KG-master\ie\extract-para.py
     文件        2281  2019-06-10 02:25  WEB_KG-master\ie\extract-table.py
     目录           0  2019-06-10 02:25  WEB_KG-master\kg\
     文件        1083  2019-06-10 02:25  WEB_KG-master\kg\build-triple-from-table.py
     文件        1175  2019-06-10 02:25  WEB_KG-master\kg\insert_to_neo4j.py
     文件      397289  2019-06-10 02:25  WEB_KG-master\kg\kg.png
     文件    53091044  2019-06-10 02:25  WEB_KG-master\kg\triples.txt
     目录           0  2019-06-10 02:25  WEB_KG-master\spider\
     文件        1337  2019-06-10 02:25  WEB_KG-master\spider\html_downloader.py
     文件        2366  2019-06-10 02:25  WEB_KG-master\spider\html_parser.py
     文件        2189  2019-06-10 02:25  WEB_KG-master\spider\spider_main.py
     文件         648  2019-06-10 02:25  WEB_KG-master\spider\url_manager.py

评论

共有 条评论