免责声明:此案例为Python入门静态网页新闻爬取,未对目标网站进行大批量访问,不承担任何责任。
写入数据库版本:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sqlalchemy
import pymysql

# MySQL engine via the PyMySQL driver; utf8 charset so Chinese text round-trips.
pymysql_engine = sqlalchemy.create_engine(
    'mysql+pymysql://root:1234@localhost/sampledb?charset=utf8'
)

root_url = 'http://www.ftchinese.com/channel/china.html'
for page in range(1, 4):  # crawl the first three pages only
    url = root_url + "?page={}".format(page)
    print("抓取页面: ", url)
    # timeout so one stalled request cannot hang the whole crawl
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # first four stories per page
        title = item.h2.a.string  # headline
        # Full-text link: drop the URL fragment (ad anchor) and request the single-page view.
        tmp_url = "http://www.ftchinese.com" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("抓取新闻: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url, timeout=10).text, "lxml")
        try:
            # Article body; a page without a story-body div is treated as a photo story.
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "图片新闻"
        # The lead/summary div may be absent on some items; guard before taking .string.
        lead_div = item.find("div", class_="item-lead", recursive=False)
        lead = lead_div.string if lead_div is not None else None
        one_page_news.append([title, lead, tmp_url, text])
    # Persist this page's rows; index=False keeps the meaningless RangeIndex
    # out of the table (it would collide/duplicate on every append).
    print("To Database...")
    df = pd.DataFrame(one_page_news, columns=['title', 'lead', 'tmp_url', 'text'])
    df.to_sql("ftnews", pymysql_engine, if_exists="append", index=False)
JSON版本:
import requests
from bs4 import BeautifulSoup
import json

all_page_news = []
root_url = 'http://www.ftchinese.com/channel/china.html'
for page in range(1, 4):  # crawl the first three pages only
    url = root_url + "?page={}".format(page)
    print("抓取页面: ", url)
    # timeout so one stalled request cannot hang the whole crawl
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # first four stories per page
        title = item.h2.a.string  # headline
        # Full-text link: drop the URL fragment (ad anchor) and request the single-page view.
        tmp_url = "http://www.ftchinese.com" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("抓取新闻: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url, timeout=10).text, "lxml")
        try:
            # Article body; a page without a story-body div is treated as a photo story.
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "图片新闻"
        # The lead/summary div may be absent on some items; guard before taking .string.
        lead_div = item.find("div", class_="item-lead", recursive=False)
        lead = lead_div.string if lead_div is not None else None
        one_page_news.append([title, lead, tmp_url, text])
    all_page_news.extend(one_page_news)

# Write all collected news to a JSON file.
# ensure_ascii=False keeps Chinese characters readable in the UTF-8 file
# instead of escaping every one as \uXXXX.
with open("ftnews.json", 'w', encoding="utf8") as file:
    json.dump(all_page_news, file, ensure_ascii=False)

# Read back:
# with open("ftnews.json", 'r', encoding="utf8") as file:
#     data_in = json.load(file)