直接上代码:
#coding=utf-8
import requests
from bs4 import Tag
from bs4 import BeautifulSoup
def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, requests waits indefinitely on a stalled
            connection, which would freeze the interactive reader.

    Returns:
        The decoded body of the HTTP response (``Response.text``).

    Raises:
        requests.exceptions.Timeout: The server did not answer within
            *timeout* seconds.
    """
    page = requests.get(url, timeout=timeout)
    return page.text
def getImg(html):
    """Interactively print the entries parsed from one listing page.

    Despite its name, this function prints text, not images. It collects
    three parallel lists from the page -- every ``<h2>``, every
    ``<div class="content">`` and every ``<span class="stats-vote">`` --
    and walks them in lockstep. Before each entry it waits for a line of
    input; the entry's three texts are then printed stripped, one per line.

    Args:
        html: Markup of a listing page.

    Returns:
        1 when at least one entry was shown and the reader did not quit
        (the caller may go on to the next page); -1 when the reader typed
        ``Q``/``q`` or when the page produced no entries at all.
    """
    soup = BeautifulSoup(html, 'html.parser')
    users = soup.find_all('h2')
    texts = soup.find_all('div', class_="content")
    votes = soup.find_all('span', class_="stats-vote")

    shown = False
    # zip stops at the shortest list, so only complete triples are printed.
    for entry in zip(users, texts, votes):
        # Block until the reader presses Enter; Q/q aborts immediately.
        if input() in ('Q', 'q'):
            return -1
        print("\n".join(tag.get_text().strip() for tag in entry))
        shown = True

    # An empty page must not report "continue": a caller looping on
    # `getImg(html) == 1` would otherwise keep requesting pages forever
    # without ever pausing for input.
    return 1 if shown else -1
if __name__ == '__main__':
    # The first page lives at the bare listing URL; every later page is
    # addressed as /page/<n>/, starting from n = 2.
    page_no = 2
    html = getHtml("https://www.qiushibaike.com/hot/")
    print("开始读取数据,请按回车键...按Q键退出\n")
    # getImg returns 1 while the reader wants to keep going.
    while getImg(html) == 1:
        print("\n第{}页\n".format(page_no))
        html = getHtml("https://www.qiushibaike.com/hot/page/%s/" % page_no)
        page_no += 1
缺点:目前无法过滤掉同时包含文本和图片的内容。
待以后优化更新。