文章目录
2020-05-03 爬虫练习
每日一个爬虫小练习,学习爬虫的记得关注哦!
学习编程就像学习骑自行车一样,对新手来说最重要的是持之以恒的练习。
在《汲取地下水》这一章节中看见的一句话:“别担心自己的才华或能力不足。持之以恒地练习,才华便会有所增长”,现在想来,真是如此。
'''
多线程爬取百思不得姐
version:01
author:金鞍少年
date:2020-05-02
'''
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import csv,time
class budejie:
    """Scrape one budejie.com listing page and store its entries in a CSV file.

    The pipeline is fetch (``gethtml``) -> parse (``parsehtml``) -> save
    (``savetalk``).  The fetch and the save run on ``self.pool``; ``main``
    waits on the resulting futures instead of polling a flag, so it always
    returns, even when a step fails.

    Args:
        max_workers: Size of the worker thread pool.
        timeout: Seconds to wait for the HTTP response before giving up.
        output_path: Destination CSV file (overwritten on every save).
    """

    def __init__(self, max_workers=10, timeout=10, output_path='百思不得姐.csv'):
        # Thread pool shared by the fetch and save steps.
        self.pool = ThreadPoolExecutor(max_workers)
        # Completion flag kept for callers that poll it: it becomes True once
        # a crawl has finished, whether it succeeded or failed.
        self.switch = False
        self.timeout = timeout
        self.output_path = output_path
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
            'Referer': 'http://www.budejie.com/text/'
        }

    def gethtml(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup document.

        Returns:
            The parsed document, or ``None`` when the request fails (the
            error is printed, matching the original best-effort behaviour).
        """
        try:
            # Without a timeout a stalled server would block the worker forever.
            res = requests.get(url=url, headers=self.headers, timeout=self.timeout)
            res.raise_for_status()
        except requests.RequestException as e:
            print(e)
            return None
        return BeautifulSoup(res.content, 'lxml')

    def parsehtml(self, html):
        """Extract ``{link: text}`` pairs from a fetched page and queue the save.

        Args:
            html: A ``Future`` whose result is the document returned by
                ``gethtml``.  The method therefore still works as a
                ``Future.add_done_callback`` target.

        Returns:
            The ``Future`` of the queued ``savetalk`` call, or ``None`` when
            the fetch produced no document.
        """
        soup = html.result(timeout=None)
        if soup is None:
            # The fetch failed: nothing will be saved, so mark the crawl as
            # finished here to keep flag-polling callers from waiting forever.
            self.switch = True
            return None

        talks = {}
        for entry in soup.find_all('div', class_="j-r-list-c"):
            link = entry.a
            # Skip entries that carry no link instead of crashing on them.
            if link is None or not link.get('href'):
                continue
            talks['http://www.budejie.com' + link['href']] = entry.get_text().strip()
        return self.pool.submit(self.savetalk, talks)

    def savetalk(self, talks):
        """Write every ``(link, text)`` pair of ``talks`` as a CSV row.

        ``self.switch`` is set even when writing fails, so callers waiting on
        the flag are always released.
        """
        try:
            with open(self.output_path, 'w', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)  # one writer for the whole file
                for href, text in talks.items():
                    writer.writerow((href, text))
                    print(href, '写入成功!')
        finally:
            self.switch = True

    def main(self, url):
        """Fetch, parse and save ``url``, then shut the thread pool down.

        Exceptions raised while parsing or saving propagate to the caller;
        the pool is shut down in every case.
        """
        try:
            page_future = self.pool.submit(self.gethtml, url)
            # parsehtml blocks on the fetch result, so no polling is needed.
            save_future = self.parsehtml(page_future)
            if save_future is not None:
                save_future.result()  # wait for the CSV write; surfaces its errors
        finally:
            self.switch = True
            self.pool.shutdown()
if __name__ == '__main__':
    # Script entry point: crawl the budejie text channel page.
    budejie().main('http://www.budejie.com/text/')