爬虫之——腾讯新闻简单实例
本实例需要用到两个库:requests 和 Beautiful Soup
import requests
from bs4 import BeautifulSoup
# Fetch the raw HTML of a web page.
def getHTMLText(url):
    """Return the HTML text at *url*, or an error message on failure.

    The request is capped at 20 seconds; HTTP error statuses (4xx/5xx)
    are turned into exceptions by raise_for_status().
    """
    try:
        r = requests.get(url, timeout=20)  # limit the wait to 20 seconds
        r.raise_for_status()  # raise immediately on an HTTP error status
        # Prefer the encoding sniffed from the body over the (often
        # missing or wrong) header-declared encoding.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so that programming errors
        # (NameError, KeyboardInterrupt, ...) are no longer swallowed;
        # only network/HTTP failures map to the sentinel message.
        return "网页内容获取失败"
# Scrape one news article and save it to a local file.
def saveText(url):
    """Extract the article at *url* and write it to text.txt (UTF-8).

    Pulls the headline, timestamp, body paragraphs and editor credit
    out of the Tencent News page layout. Best-effort: if a selector
    matches nothing, whatever was already written is kept.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.select("div.hd > h1")
    # The <span class="a_time"> timestamp inside the <div class="a_Info"> block.
    time = soup.select("div.a_Info > span.a_time")
    author = soup.select("div.qq_articleFt > div.qq_toolWrap > div.qq_editor")
    paras = soup.select("div > p.text")
    # The with-statement closes the file automatically, so the original
    # manual f.close() calls in except/finally were redundant.
    # write() replaces the misused writelines() (which only worked by
    # iterating the string character by character).
    with open("text.txt", "w", encoding="UTF-8") as f:
        try:
            f.write(title[0].get_text() + "\n")
            f.write(time[0].get_text() + "\n")
            for para in paras:
                if len(para) > 0:
                    f.write(para.get_text() + "\n\n")
            f.write(author[0].get_text() + "\n")
        except IndexError:
            # A selector matched nothing (page layout changed or fetch
            # failed); preserve the original best-effort behavior
            # instead of crashing.
            pass
def main():
    """Entry point: scrape one sample Tencent News article to text.txt."""
    url = "http://news.qq.com/a/20170504/012032.htm"
    saveText(url)  # trailing semicolon removed — not Python style


# Guard the call so importing this module no longer triggers a network
# fetch as a side effect; behavior as a script is unchanged.
if __name__ == "__main__":
    main()
运行上述代码,可以得到text文本内容如下:
小编有话说:本篇文章是对该链接内容的学习笔记,原文见:https://www.csdn.net/gather_2c/MtjaIgzsODgwLWJsb2cO0O0O.html