import requests
import time
import multiprocessing
import os
from lxml import etree
start = time.time()
def request(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # Note: requests only decodes 'br' responses if the brotli package is installed.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Session-specific cookie; replace with a fresh one if requests start failing.
        'Cookie': 'BAIDUID=A451DB8F7AEFC4B615D60BC3E8858FFD:FG=1; BIDUPSID=A451DB8F7AEFC4B615D60BC3E8858FFD; PSTM=1525410326; __cfduid=df85f9c87f97a212b389ee23d5433e0d21525412886; BDUSS=BXUXk2b1daSlExdVg5OHUtamQ3R3d6T1d4c1Z6UFNZUUcyVzFuMG5pNTF-aVZiQVFBQUFBJCQAAAAAAAAAAAEAAACuYsUz0ru6rTM4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHVx%7Elp1cf5aMV; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21090_26350_20928; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1526883395,1526965557,1527066562,1527131168; pgv_pvi=3767739392; pgv_si=s922310656; Hm_lpvt_55b574651fcae74b0a9f1cf9c8d7c93a=1527131552',
        'Host': 'baike.baidu.com',
        'Referer': 'https://baike.baidu.com/ziran',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    try:
        return requests.get(url, headers=headers, timeout=5).content
    except requests.RequestException:
        # Log the failed URL so it can be retried later.
        with open('failed_urls.txt', 'a') as f:
            f.write(url + '\n')
        return None
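# Optional retry wrapper (a sketch, not part of the original flow):
# `request_with_retry` is a hypothetical helper that re-attempts transient
# failures a few times before giving up. Note that request() logs the URL
# to failed_urls.txt on every failed attempt, so retries log repeatedly.
def request_with_retry(url, retries=3, delay=1):
    for _ in range(retries):
        result = request(url)
        if result is not None:
            return result
        time.sleep(delay)
    return None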
def parse(html, url):
    exactor_data = etree.HTML(html)
    # Introduction (jie_shao): try the standard lemma-summary layout first,
    # then two alternate page layouts. `xpath(...) or None` maps an empty
    # result list to None without evaluating the query twice.
    jie_shao = exactor_data.xpath("//div[@class = 'main-content']//div[@class = 'lemma-summary']/div[@class = 'para'][1]//text()") or None
    if jie_shao is None:
        jie_shao = exactor_data.xpath("//dd[@class = 'desc']/div[@class = 'lemma-summary']/div[1]//text()") or None
    if jie_shao is None:
        jie_shao = exactor_data.xpath("//div[@class = 'poster-top']/div[contains(@class,'lemmaWgt-lemmaSummary-dark')]//text()") or None
    # Description: second summary paragraph, with the same layout fallbacks.
    description = exactor_data.xpath("//div[@class = 'main-content']//div[@class = 'lemma-summary']/div[@class = 'para'][2]//text()") or None
    if description is None:
        description = exactor_data.xpath("//dd[@class = 'desc']/div[@class = 'lemma-summary']/div[2]//text()") or None
    if description is None:
        description = exactor_data.xpath("//div[@class = 'main-content']//div[@class = 'para'][2]//text()") or None
    print(url)
    print(jie_shao)
    print(description)
    # Basic-info box, left column: field names (<dt> elements)
    exactor_left_dt_list = exactor_data.xpath("//dl[contains(@class,'basicInfo-left')]//dt")
    basicInfo_left_dt = ''
    for exactor_left in exactor_left_dt_list:
        text = exactor_left.xpath('.//text()')
        if text:
            text_str = ''.join(text).replace(' ', '')
            basicInfo_left_dt += text_str + '&'
    # Basic-info box, left column: field values (<dd> elements)
    exactor_left_dd_list = exactor_data.xpath("//dl[contains(@class,'basicInfo-left')]//dd")
    basicInfo_left_dd = ''
    for exactor_left in exactor_left_dd_list:
        text = exactor_left.xpath('.//text()')
        if text:  # guard against empty <dd> cells, which would crash ''.join(None)
            text_str = ''.join(text).replace(' ', '').strip().replace('\n', ',')
            basicInfo_left_dd += text_str + '&'
    # Basic-info box, right column: field names (<dt> elements)
    exactor_right_dt_list = exactor_data.xpath("//dl[contains(@class,'basicInfo-right')]//dt")
    basicInfo_right_dt = ''
    for exactor_right in exactor_right_dt_list:
        text = exactor_right.xpath('.//text()')
        if text:
            text_str = ''.join(text).replace(' ', '')
            basicInfo_right_dt += text_str + '&'
    # Basic-info box, right column: field values (<dd> elements)
    exactor_right_dd_list = exactor_data.xpath("//dl[contains(@class,'basicInfo-right')]//dd")
    basicInfo_right_dd = ''
    for exactor_right in exactor_right_dd_list:
        text = exactor_right.xpath('.//text()')
        if text:  # same guard as the left column
            text_str = ''.join(text).replace(' ', '').strip().replace('\n', ',')
            basicInfo_right_dd += text_str + '&'
    # Only emit a record when the page yielded all three key parts.
    if jie_shao and basicInfo_left_dt and description:
        # Join the text-node lists into plain strings before formatting,
        # so the output file holds text rather than Python list reprs.
        data = '%s$%s$%s$%s$%s$%s' % (''.join(jie_shao).strip(), ''.join(description).strip(),
                                      basicInfo_left_dt, basicInfo_left_dd,
                                      basicInfo_right_dt, basicInfo_right_dd)
        return data
    return None
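# Optional refactor (sketch): the three-way XPath fallbacks in parse() repeat
# one pattern; a helper like this could collapse them. `first_text` is a
# hypothetical name and is not used by the original script.
def first_text(tree, *queries):
    """Return the text nodes of the first query that matches, else None."""
    for query in queries:
        result = tree.xpath(query)
        if result:
            return result
    return None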
def write(data):
    # Workers append to this file concurrently; short single-line appends on a
    # local filesystem rarely interleave, but see the parent-writes sketch below.
    if data:
        with open('ancient_people.txt', 'a') as f:
            f.write(data + '\n')
def main(url):
    html = request(url)
    if html is None:
        # Request failed; the URL was already logged by request().
        return
    data = parse(html, url)
    write(data)
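# Safer I/O pattern (sketch): have each worker return its record and let the
# parent process do every file write, so no two processes append at once.
# `scrape` is a hypothetical name; in __main__ it would replace
# pool.map(main, url_list) with:
#     for record in pool.imap_unordered(scrape, url_list):
#         write(record)
def scrape(url):
    html = request(url)
    return parse(html, url) if html is not None else None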
if __name__ == '__main__':
    # Get the number of CPU cores
    cpu_count = multiprocessing.cpu_count()
    print('Cpu count:', cpu_count)
    # Start one worker process per core
    pool = multiprocessing.Pool(cpu_count)
    # Build the URL list from the keyword files
    basic_path = '/Users/apple/PycharmProjects/baike_people/key_words/ji_ri'
    list_file = os.listdir(basic_path)
    for file in list_file:
        if file.endswith('国际日.txt'):
            path = basic_path + '/' + file
            with open(path, 'r') as f:
                names = f.read()
            print(names)
            # split() without arguments handles spaces and newlines in one step
            name_list = names.split()
            print(name_list)
            url_list = {'https://baike.baidu.com/item/{}'.format(name) for name in name_list}
            print(url_list)
            print(len(url_list))
            pool.map(main, url_list)
    pool.close()
    pool.join()
    end = time.time()
    print('Cost time:', end - start)