# -*- coding:utf-8 -*-
"""A simple crawler for qiushibaike.com jokes.

Fetches listing pages and prints every joke whose upvote count exceeds
1000 and whose comment count exceeds 10.

Ported from Python 2 to Python 3: ``print`` statements became calls and
``itertools.izip`` (removed in Python 3) became the builtin ``zip``.
"""
import time

import requests
from bs4 import BeautifulSoup

# Pretend to be a browser so the site does not reject the request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}


def get_url(start, end):
    """Crawl listing pages ``start`` through ``end`` (inclusive).

    Bug fix: the original looped over the tuple ``(start, end)`` and so
    visited only the two endpoint pages; ``range(start, end + 1)`` visits
    every page in between as well, which is what "crawl multiple pages"
    was clearly meant to do (``get_url(1, 5)`` -> pages 1..5).
    """
    for num in range(start, end + 1):
        url = 'http://www.qiushibaike.com/8hr/page/' + str(num) + '/?s=4950794'
        get(url)
        time.sleep(3)  # throttle requests to be polite to the server


def get(url):
    """Fetch one listing page and print the jokes that pass the filter.

    A joke is printed when its upvote count is greater than 1000 and its
    comment count is greater than 10.
    """
    wb = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb.text, 'lxml')
    contents = soup.select('a > div[class="content"] > span')
    marks = soup.select('div > span > i')          # upvote counts
    comments = soup.select('div > span > a > i')   # comment counts
    # zip stops at the shortest list, matching the original izip behavior.
    for content, mark, comment in zip(contents, marks, comments):
        if int(mark.get_text()) > 1000 and int(comment.get_text()) > 10:
            print(content.get_text())
            print()


if __name__ == "__main__":
    get_url(1, 5)
一个爬取糗事百科段子的简单爬虫（筛选出点赞数大于 1000 且评论数大于 10 的段子并打印）
最新推荐文章于 2021-02-12 11:47:32 发布