IO读写操作比较适合使用多线程。虽然Python的多线程受GIL限制,是"伪多线程",但CPU会在读取和写入之间快速切换,边爬取边写入,比逐条顺序读写要快。
import csv
import re
import threading
from queue import Empty, Queue

import requests
class POSpider(threading.Thread):
    """Crawler thread: takes page URLs off ``page_queue``, scrapes poem
    titles and bodies from gushiwen.cn, and puts ``(title, poem)`` tuples
    onto ``poems_queue`` for the writer threads.

    Exits when ``page_queue`` is empty.
    """

    # Desktop-Chrome UA plus a session cookie (appears to be copied from a
    # browser session — TODO confirm the cookie is still required).
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE','cookie':'Hm_lvt_9007fab6814e892d3020a64454da5a55=1599659724,1599704234; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1599713077'}

    def __init__(self, page_queue, poems_queue, *args, **kwargs):
        """
        :param page_queue: Queue of page URLs to crawl.
        :param poems_queue: Queue receiving (title, content) tuples.
        """
        super(POSpider, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.poems_queue = poems_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            response = requests.get(url, headers=self.headers)
            text = response.text
            # Poem titles are inside <b> tags of div.cont blocks.
            titles = re.findall(r'div\sclass="cont".*?<b>(.*?)</b>', text, re.S)
            poems = re.findall(r'div\sclass="contson".*?">.*?(.*?)</div>', text, re.S)
            # BUG FIX: the cleaned-poem list must be rebuilt for EACH page.
            # The original accumulated it across pages, so from page 2 on,
            # zip(titles, sortP) paired this page's titles with the previous
            # page's poems and re-enqueued old poems as duplicates.
            cleaned = [re.sub('<.*?>', '', poem) for poem in poems]
            for title, body in zip(titles, cleaned):
                self.poems_queue.put((title, body))
            print('+'*30+"第%s页已下载完成!" %url.split('0AA')[-1]+'+'*30)
class POWriter(threading.Thread):
    """Writer thread: pulls ``(title, poem)`` tuples off ``poems_queue`` and
    appends them through a shared ``csv.writer``, serialising access with a
    lock (csv.writer is not thread-safe).

    Exits when the queue stays empty for ``timeout`` seconds — the signal
    that the spider threads have finished.
    """

    def __init__(self, poems_queue, gLock, writer, *args, timeout=50, **kwargs):
        """
        :param poems_queue: Queue of (title, content) tuples from POSpider.
        :param gLock: threading.Lock shared by all writer threads.
        :param writer: csv.writer over the single shared output file.
        :param timeout: seconds to wait on an empty queue before exiting
                        (default 50, preserving the original behaviour).
        """
        super(POWriter, self).__init__(*args, **kwargs)
        self.poems_queue = poems_queue
        self.writer = writer
        self.lock = gLock
        self.timeout = timeout

    def run(self):
        while True:
            try:
                # BUG FIX: catch only queue.Empty (the intended exit
                # condition). The original bare `except:` silently swallowed
                # every error — including real csv write failures — and the
                # lock was not released on exception.
                title, content = self.poems_queue.get(timeout=self.timeout)
            except Empty:
                break
            with self.lock:
                self.writer.writerow((title, content))
            print('保存一首')
def main():
    """Crawl pages 1-5 with 5 spider threads and write poems to
    chinapoems.csv via 5 writer threads sharing one csv.writer.
    """
    page_queue = Queue(10)
    poems_queue = Queue(30)
    gLock = threading.Lock()
    # BUG FIX: 'gbk' cannot encode rare CJK characters (e.g. '\u4729' 䜩 in
    # 曹操《短歌行》) and raised UnicodeEncodeError. 'utf-8-sig' encodes all
    # of Unicode, and its BOM keeps Excel from mis-detecting the encoding
    # (the reason plain utf-8 looked garbled). newline='' is required by csv.
    with open('chinapoems.csv', 'a', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        writer.writerow(('title', 'content'))
        for page in range(1, 6):
            page_queue.put('https://so.gushiwen.cn/shiwen/default_0AA%d.aspx' % page)
        threads = [POSpider(page_queue, poems_queue) for _ in range(5)]
        threads += [POWriter(poems_queue, gLock, writer) for _ in range(5)]
        for t in threads:
            t.start()
        # BUG FIX: join before the `with` block closes the file — the
        # original let main() return with writer threads still writing to
        # an open (and leaked) file handle.
        for t in threads:
            t.join()


if __name__ == "__main__":
    main()
爬取的内容本身不存在问题,但遍历的页面过多时,写入到CSV速度很慢,而且会假死。分析大概是诗句中含有比较生僻的中文字符,在按gbk编码写入时失败。
事实证明的确是因为曹操诗句中有个复杂的汉字:䜩,以至于进行不下去。
UnicodeEncodeError details: ‘gbk’ codec can’t encode character ‘\u4729’ in position 126: illegal multibyte sequence 短歌行 两汉:曹操
对酒当歌,人生几何!譬如朝露,去日苦多。慨当以慷,忧思难忘。何以解忧?唯有杜康。青青子衿,悠悠我心。但为君故,沉吟至今。呦呦鹿鸣,食野之苹。我有嘉宾,鼓瑟吹笙。明明如月,何时可掇?忧从中来,不可断绝。越陌度阡,枉用相存。契阔谈䜩,心念旧恩。(谈䜩 一作:谈宴)月明星稀,乌鹊南飞。绕树三匝,何枝可依?山不厌高,海不厌深。周公吐哺,天下归心。
import re
import requests
# Request headers for gushiwen.cn: desktop-Chrome User-Agent plus a session
# cookie (looks copied from a browser session — TODO confirm still needed).
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE','cookie':'Hm_lvt_9007fab6814e892d3020a64454da5a55=1599659724,1599704234; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1599713077'}
def parse_page(url):
    """Fetch one gushiwen.cn list page and return formatted poem entries.

    Each entry is '\\t' + title + '\\t\\t' + author + body + '\\n', with
    residual HTML tags stripped from author and body.
    """
    html = requests.get(url, headers=headers).text
    # Titles sit inside <b> of div.cont; bodies in div.contson; authors in
    # p.source — all extracted with non-greedy matches across newlines.
    titles = re.findall(r'div\sclass="cont".*?<b>(.*?)</b>', html, re.S)
    raw_poems = re.findall(r'div\sclass="contson".*?">.*?(.*?)</div>', html, re.S)
    raw_authors = re.findall(r'p\sclass="source".*?>.*?(.*?)</p>', html, re.S)
    # Strip any leftover inline tags from the captured fragments.
    authors = [re.sub('<.*?>', '', a) for a in raw_authors]
    bodies = [re.sub('<.*?>', '', p) for p in raw_poems]
    # zip stops at the shortest list, matching the original map() behaviour.
    return ['\t' + t + '\t\t' + a + b + '\n'
            for t, a, b in zip(titles, authors, bodies)]
def main():
    """Crawl list pages 1-39 and append formatted entries to chinapoems.txt."""
    for page in range(1, 40):
        url = 'https://so.gushiwen.cn/shiwen/default_0AA%d.aspx' % page
        contents = parse_page(url)
        # BUG FIX: open() without `encoding` uses the locale default (gbk on
        # Windows), which cannot encode rare CJK characters such as '\u4729'
        # (䜩) and raised UnicodeEncodeError. UTF-8 encodes all of Unicode,
        # so the try/except around each write is no longer needed.
        # Also open once per page instead of re-opening for every line.
        with open('chinapoems'+'.txt', 'a', encoding='utf-8') as f:
            f.writelines(contents)


if __name__ == "__main__":
    main()