# Novel crawler: downloads chapter pages of a web novel with a small
# thread pool and appends each chapter title to a local text file.
from concurrent import futures  # NOTE(review): unused — candidate for removal
import time
start_time=time.time()  # wall-clock start, reported again at the end of __main__
print(start_time)
from concurrent.futures import ThreadPoolExecutor,as_completed  # NOTE(review): unused
import urllib3
# craw() uses verify=False, so suppress the per-request InsecureRequestWarning spam.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import threading
import requests
from bs4 import BeautifulSoup  # NOTE(review): unused — parsing is done with lxml
from lxml import etree
import queue
import random
def craw(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    TLS verification is disabled (``verify=False``) — presumably because the
    target site has certificate problems; NOTE(review): confirm this is
    intentional before reuse elsewhere.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, instead of silently
            returning an error page for the parser to choke on.
        requests.Timeout: if the request exceeds 30 seconds.
    """
    r = requests.get(url, timeout=30, verify=False)
    r.raise_for_status()  # fail loudly rather than hand error HTML downstream
    r.encoding = "utf-8"  # force decoding; the site serves UTF-8
    return r.text
def parse(html):
    """Extract the chapter title from one chapter page's HTML.

    Args:
        html: full HTML text of a chapter page.

    Returns:
        The text of the first ``<h1>`` inside ``div.read_title``.

    Raises:
        IndexError: if the page has no such heading (e.g. an error page
            or a layout change on the site).
    """
    selector = etree.HTML(html)
    # The chapter title lives in the first <h1> under the read_title div.
    x_title = "//div[@class='read_title']//h1[1]/text()"
    title = selector.xpath(x_title)[0]
    return title
# Fetch the novel's table-of-contents page once and collect the relative
# href of every chapter link, then expand them to absolute URLs.
# NOTE(review): this network call runs at import time (outside the
# __main__ guard) — consider moving it into main; left here because the
# __main__ block below reads p_url at module level.
toc_html = craw("https://www.soshuw.com/GuiMiZhiZhu/")
toc_tree = etree.HTML(toc_html)
chapter_hrefs = toc_tree.xpath('//*[@id="novel50348"]/dl[1]/dd/a/@href')
p_url = ["https://www.soshuw.com" + href for href in chapter_hrefs]
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    """Downloader worker: pull URLs from *url_queue*, fetch each page with
    craw() and push the HTML onto *html_queue*.

    The original loop blocked forever on an empty queue, so the worker
    threads (and hence the whole program) never terminated.  Now the
    thread exits once no URL has arrived for 10 seconds, i.e. the queue
    has been drained.
    """
    while True:
        try:
            url = url_queue.get(timeout=10)
        except queue.Empty:
            break  # queue drained — let the thread finish
        html = craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw {url}",
              "url_queue.size=", url_queue.qsize())
        # Polite random pause so we don't hammer the site.
        time.sleep(random.randint(1, 2))
def do_parse(html_queue: queue.Queue, fout):
    """Parser worker: pull page HTML from *html_queue*, extract the chapter
    title with parse() and append it as one line to the shared *fout*.

    *fout* must be an already-open text-mode file object; it is shared by
    all parser threads and is never closed by the main script, so every
    write is flushed immediately to make sure it reaches disk.

    The thread exits once no HTML has arrived for 10 seconds (queue
    drained), instead of blocking forever as the original did.
    """
    while True:
        try:
            html = html_queue.get(timeout=10)
        except queue.Empty:
            break  # no more pages coming — let the thread finish
        results = parse(html)
        print('results:', results)
        fout.write(str(results) + "\n")
        # fout is never closed by the caller; flush so the line is not
        # lost in the buffer when the process is killed.
        fout.flush()
        print(threading.current_thread().name, f"results.size", len(results),
              "html_queue.size=", html_queue.qsize())
        time.sleep(random.randint(1, 2))
if __name__ == "__main__":
    # Seed the work queue with every chapter URL collected at import time.
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in p_url:
        url_queue.put(url)

    # Three downloader threads feed html_queue from url_queue.
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                             name=f"craw{idx}")
        t.start()

    # Open the output file line-buffered (buffering=1) so every written
    # title reaches disk immediately: the worker threads run on after
    # this block ends, so the file is never explicitly closed and a
    # fully-buffered handle would lose the pending data.
    fout = open("k:/zhusc/1.txt", "a", buffering=1, encoding='utf-8')

    # Two parser threads drain html_queue into fout.
    for idx in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout),
                             name=f"parse{idx}")
        t.start()

    # Printed immediately after thread start-up — this measures setup
    # time only, not how long the crawl takes.
    print(time.time() - start_time)
# Question (translated from Chinese; kept as a comment so the file parses):
# Why does commenting out the `fout = open("k:/zhusc/1.txt", ...)` line stop
# data from being written to 1.txt?  And doesn't passing `fout` as an
# argument into `do_parse` work either?
# Answer sketch: `do_parse` receives `fout` as a parameter, so if the open()
# call is removed the name is undefined and the parser threads die on a
# NameError; even when it is opened, nothing is written to disk until the
# buffer is flushed or the file is closed — and this script never closes it.