"""Thread-pool scraping example: dump price-table rows from many pages to CSV.

A thread pool creates a fixed set of worker threads up front.  Callers only
submit tasks; scheduling them onto the workers is the pool's job.

Minimal usage example::

    from concurrent.futures import ThreadPoolExecutor

    def fn(name):
        for i in range(100):
            print(name, i)

    if __name__ == '__main__':
        # Create the pool.
        with ThreadPoolExecutor(50) as t:
            for i in range(100):
                t.submit(fn, name=f"thread-{i}")
        # Leaving the ``with`` block waits for every submitted task to
        # finish before execution continues.
        print("123")
"""
# Real-world example: extract data from many pages by fetching them
# concurrently with a thread pool.
import csv
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from lxml import etree

# newline="" lets the csv module control line endings itself (otherwise
# blank rows appear between records on Windows).
f = open("date.csv", mode="w", encoding="utf-8", newline="")
csvwriter = csv.writer(f)

# The single csv writer is shared by every worker thread, so row writes are
# serialised to keep rows from interleaving.
_write_lock = threading.Lock()

# XPath of the table on a list page.
_TABLE_XPATH = "/html/body/div[2]/div/div/div/div[4]/div[1]/div/table"


def download_onepage(url):
    """Fetch one list page and append every table row to the shared CSV.

    Args:
        url: Address of the page to download.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.

    Pages on which the expected table is not found are skipped with a
    message instead of raising ``IndexError``.
    """
    # A timeout keeps one stalled connection from blocking a worker forever.
    resp = requests.get(url, timeout=10)
    html = etree.HTML(resp.text)
    tables = html.xpath(_TABLE_XPATH) if html is not None else []
    if not tables:
        print(f"no table found, skipped: {url}")
        return

    for tr in tables[0].xpath("./tr"):
        cells = tr.xpath("./td/text()")
        # Build a real list (not a one-shot generator) because the row is
        # both written and printed.  Removing every "/" also removes "//".
        row = [cell.replace("/", "") for cell in cells]
        with _write_lock:
            csvwriter.writerow(row)
        print(row)


if __name__ == '__main__':
    # Sequential alternative: call download_onepage(url) in a plain loop.
    try:
        with ThreadPoolExecutor(50) as t:
            futures = {}
            for i in range(1, 14870):
                url = f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml"
                futures[t.submit(download_onepage, url)] = url
            # Exceptions raised inside workers stay hidden in their futures
            # unless they are inspected, so report them here.
            for done in as_completed(futures):
                error = done.exception()
                if error is not None:
                    print(f"failed: {futures[done]}: {error!r}")
    finally:
        # All tasks have finished once the pool exits; flush and release
        # the output file.
        f.close()
# 学习爬虫第九天
# 最新推荐文章于 2024-08-10 10:09:09 发布