# 糗事百科 crawler implemented with Python 3 + Requests.
import requests,threading,time,re
class Spider_QSBK:
    """Crawler for qiushibaike.com "hot" pages.

    A background thread keeps a small buffer of pre-fetched pages while
    the main thread shows one story at a time, waiting for the user to
    press Enter (typing "quit" stops everything).
    """

    def __init__(self):
        # Next page number the fetcher thread will download.
        self.page = 1
        # Buffer of pre-fetched pages; each entry is a list of story strings.
        self.pages = []
        # Master switch shared by the fetcher thread and the display loop.
        self.enable = False

    def getPage(self, page):
        """Download one hot page and return its stories as a list of strings.

        page -- the page number as a *string* (appended to the URL).
        Returns an empty list when the request fails.
        """
        url = 'http://www.qiushibaike.com/hot/page/' + page
        # The site rejects requests without a browser-like User-Agent.
        # (Fixed typo: "Safaria" -> "Safari".)
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.86 Safari/537.36')
        headers = {'User-Agent': user_agent}
        response = requests.get(url, headers=headers)
        items = []
        if response.status_code == 200:
            response.encoding = 'utf-8'
            # Each story sits in <div class="content"><span>...</span></div>;
            # re.S lets .*? cross newlines in the raw HTML.
            myItems = re.findall(
                r'<div class="content">.*?<span>(.*?)</span>.*?</div>',
                response.text, re.S)
            for item in myItems:
                items.append(item.replace("\n", ""))
        else:
            # BUG FIX: page is a str, so the original '%d' raised TypeError
            # on the error path instead of printing the message.
            print('第%s页下载失败:[%d] [%s]'
                  % (page, response.status_code, response.reason))
        return items

    def loadPage(self):
        """Fetcher-thread body: keep at most two pages buffered ahead."""
        while self.enable:
            if len(self.pages) < 2:
                myPage = self.getPage(str(self.page))
                if len(myPage) > 0:
                    self.page += 1
                    self.pages.append(myPage)
                else:
                    # BUG FIX: back off on a failed/empty fetch instead of
                    # re-requesting the same page in a tight loop.
                    time.sleep(1)
            else:
                time.sleep(1)

    def start(self):
        """Start the fetcher thread and run the interactive display loop."""
        self.enable = True
        page = self.page
        print('正在加载中请稍候...')
        work_thread = threading.Thread(target=self.loadPage, args=(), name="worker")
        work_thread.start()
        while self.enable:
            if self.pages:
                nowPage = self.pages[0]
                del self.pages[0]
                self.showPage(nowPage, page)
                page += 1
            else:
                # BUG FIX: the original spun at 100% CPU while waiting for
                # the fetcher; yield briefly when the buffer is empty.
                time.sleep(0.1)

    def showPage(self, nowPage, page):
        """Print each story of *nowPage*, pausing for Enter between stories.

        Typing "quit" at the prompt clears self.enable, which also stops
        the fetcher thread and the main loop in start().
        """
        for items in nowPage:
            print('第%d页' % page)
            print(items)
            myInput = input(' ')
            if myInput == "quit":
                self.enable = False
                break
# Entry point: prompt once, then start the interactive crawler.
# Guarded so importing this module does not block on input() or hit the network.
if __name__ == "__main__":
    print('请按下回车开始浏览今日糗事百科内容: ')
    input(' ')
    myspider = Spider_QSBK()
    myspider.start()