import requests
from lxml import etree
import os
from concurrent.futures import ThreadPoolExecutor
# Fetch the index page and collect the <li> entries that Load_txt processes.
url = "http://www.xfuedu.org/bxwx/9613/"
# exist_ok=True lets the script be re-run without crashing on the output
# directory left behind by a previous run (os.mkdir raised FileExistsError).
os.makedirs("武动乾坤", exist_ok=True)
# A timeout keeps a stalled connection from hanging the script forever.
resp = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()
html = etree.HTML(resp.text)
# Absolute XPath to the <li> items; it is tied to the page's current layout.
lis = html.xpath("/html/body/div[3]/div[2]/div/div[2]/ul/li")
# Locate the text content of a linked page pair and write it to disk.
def _fetch_content(page_url):
    """Return the text nodes under the element with id="content" at page_url."""
    # The timeout stops one stalled socket from blocking a worker thread
    # (and therefore the whole pool shutdown) indefinitely.
    page = etree.HTML(requests.get(page_url, timeout=30).text)
    return page.xpath('//*[@id="content"]/text()')


def Load_txt(li):
    """Download the two pages linked from ``li`` and save them as one file.

    The link's ``href`` is appended to the site base URL to form the first
    page's address; the second page is the same address with ``_2`` inserted
    before the ``.html`` suffix. The text of both pages is written to
    ``武动乾坤/<link text>.txt``.

    Args:
        li: An lxml element whose ``./a`` child provides the relative
            ``href`` and the link text used as the output file name.

    Raises:
        IndexError: If ``li`` has no ``./a/@href`` or no ``./a/text()``.
        requests.exceptions.RequestException: If a request fails or times out.
    """
    url = "http://www.xfuedu.org/bxwx/9613/" + li.xpath("./a/@href")[0]
    name = li.xpath("./a/text()")
    first_page = _fetch_content(url)
    # url[:-5] drops the trailing ".html" so "_2.html" can be appended.
    second_page = _fetch_content(url[:-5] + "_2.html")
    # Join both pages with the same line separator so the last line of page
    # one does not run straight into the first line of page two.
    cont = "\n".join(map(str, first_page + second_page))
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # Chinese text and would otherwise vary from machine to machine.
    with open('武动乾坤/' + name[0] + '.txt', 'w', encoding='utf-8') as f:
        f.write(cont)
if __name__ == '__main__':
    # Run the downloads on a thread pool; each task is network-bound.
    with ThreadPoolExecutor(50) as t:
        futures = [t.submit(Load_txt, li) for li in lis]
    # Leaving the with-block waits for every submitted task, so each future
    # is finished here and exception() returns immediately. Without this
    # check, errors raised inside worker threads would be silently dropped.
    errors = [future.exception() for future in futures]
    errors = [error for error in errors if error is not None]
    for error in errors:
        print("download failed:", repr(error))
    if errors:
        print(len(errors), "of", len(futures), "downloads failed")
    else:
        print("OK")
# Multithreaded scraper for the novel 《武动乾坤》.
# (Blog-page footer: latest recommended article published 2024-08-10 23:14:03.)