爬取外交部每日的新闻记者会发言稿
1.中文版:
import requests
import bs4
import os
import datetime
import time
from lxml import etree
# Browser-like request headers so the site does not reject the scraper.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Collect the relative href of every press-conference transcript from the
# index page plus the paginated pages index_1 ... index_7.
# (The original code initialized `lists` twice and duplicated the
# fetch/parse logic between the first page and the pagination loop.)
BASE_CN = 'https://www.fmprc.gov.cn/fyrbt_673021/jzhsl_673025'
page_urls = [BASE_CN + '/index.shtml']
page_urls += [BASE_CN + '/index_' + str(i) + '.shtml' for i in range(1, 8)]

lists = []
for lists_data_url in page_urls:
    resp_lists = requests.get(lists_data_url, headers=headers)
    e_list = etree.HTML(resp_lists.text)
    # Each href is the relative path of one press-conference transcript.
    lists_data_id = e_list.xpath('//div[@class="newsBd"]/ul[@class="list1"]/li/a/@href')
    lists += lists_data_id
    print(lists)
def fetchUrl(url):
    """Download *url* and return the decoded HTML text.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    # Let requests sniff the encoding from the body so Chinese text
    # decodes correctly instead of defaulting to ISO-8859-1.
    response.encoding = response.apparent_encoding
    return response.text
def getContent(html):
    """Parse a Chinese transcript page; return "title\\n" + body paragraphs.

    The title comes from the page's <h1>; the body paragraphs live under
    the div with class "news-main".
    """
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    title = bsobj.h1.text + '\n'
    print(title)
    article_div = bsobj.find('div', attrs={'class': 'news-main'})
    # One line of output per <p> element, newline-terminated.
    content = ''.join(p.text + '\n' for p in article_div.find_all('p'))
    print(content)
    return title + content
def gettitle(html):
    """Parse an HTML page and return the article title (text of its <h1>)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    heading = soup.h1.text
    print(heading)
    return heading
def saveFile(content, path, filename):
    """Append *content* to the file *path*/*filename*.

    Creates the target directory (including parents) if it does not exist.
    The file is opened in append mode so successive articles accumulate
    in one file.
    """
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(path): os.makedirs(path)`.
    os.makedirs(path, exist_ok=True)
    # os.path.join is robust whether or not *path* ends with a separator.
    with open(os.path.join(path, filename), 'a', encoding='utf-8') as f:
        f.write(content)
# Download every transcript and append it to one local text file.
for num in lists:
    # num is a relative href like "./txxxx.shtml"; num[1:] splices it
    # onto the section root URL.
    eachday_url = f'https://www.fmprc.gov.cn/fyrbt_673021/jzhsl_673025{num[1:]}'
    print(eachday_url)
    # fetchUrl performs the GET itself; the original code additionally
    # called requests.get + etree.HTML here, downloading every article
    # twice and never using the parse result.
    html = fetchUrl(eachday_url)
    content = getContent(html)
    filename = '外交部发言中文'
    path = "D:/wjbremarks/"
    saveFile(content, path, filename)
保存结果:
pdf 版是在浏览器中打开页面后“另存为”生成的;因内容较敏感,这里不附截图。
2.英文版
import requests
import bs4
import os
import datetime
import time
from lxml import etree
# Browser-like request headers so the site does not reject the scraper.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Collect the relative href of every English transcript from the index
# page plus the paginated pages index_1 ... index_6.
# (The original code initialized `lists` twice and duplicated the
# fetch/parse logic between the first page and the pagination loop.)
BASE_EN = 'https://www.fmprc.gov.cn/eng/xwfw_665399/s2510_665401/2511_665403/'
page_urls = [BASE_EN]
page_urls += [BASE_EN + 'index_' + str(i) + '.html' for i in range(1, 7)]

lists = []
for lists_data_url in page_urls:
    print(lists_data_url)
    resp_lists = requests.get(lists_data_url, headers=headers)
    e_list = etree.HTML(resp_lists.text)
    # Each href is the relative path of one press-conference transcript.
    lists_data_id = e_list.xpath('//div[@class="newsLst_mod"]/ul/li/a/@href')
    print(lists_data_id)
    lists += lists_data_id
    print(lists)
def fetchUrl(url):
    """Fetch *url* and return its HTML body as text.

    A non-2xx status raises requests.HTTPError.
    """
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    # Re-decode with the encoding sniffed from the content, which is
    # more reliable than the (often missing) response header charset.
    resp.encoding = resp.apparent_encoding
    return resp.text
def getContent(html):
    """Parse an English transcript page; return "title\\n" + body paragraphs.

    The site serves the article body under several slightly different
    TRS-editor class combinations, so each known variant is tried in
    order until a container div is found.

    Raises AttributeError if no known container class is present
    (matching the original code's failure mode).
    """
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    # English pages put the title in <h2> (the Chinese pages use <h1>).
    title = bsobj.h2.text + '\n'
    print(title)
    # Known container class variants, most common first. This replaces
    # the original's nested bare `except:` chain, which could also
    # swallow unrelated errors.
    candidate_classes = (
        'trs_editor_view TRS_UEDITOR trs_paper_default trs_web',
        'trs_editor_view TRS_UEDITOR trs_paper_default trs_word trs_web',
        'trs_editor_view TRS_UEDITOR trs_paper_default trs_web trs_word',
    )
    body = None
    for cls in candidate_classes:
        body = bsobj.find('div', attrs={'class': cls})
        if body is not None:
            break
    if body is None:
        raise AttributeError('no known article container div found in page')
    pList = body.find_all('p')
    content = ''
    for p in pList:
        content += p.text + '\n'
    print(content)
    # Return title + body as one string.
    return title + content
def gettitle(html):
    """Parse an HTML page and return the article title (text of its <h2>)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    heading = soup.h2.text
    print(heading)
    return heading
def saveFile(content, path, filename):
    """Append *content* to the file at path + filename.

    The directory is created first if it does not exist; append mode
    lets successive articles accumulate in a single file.
    """
    # Create the destination directory on first use.
    if not os.path.exists(path):
        os.makedirs(path)
    target = path + filename
    with open(target, 'a', encoding='utf-8') as out:
        out.write(content)
# Download every English transcript and append it to one local text file.
for num in lists:
    # num is a relative href like "./txxxx.html"; num[1:] splices it
    # onto the section root URL.
    eachday_url = f'https://www.fmprc.gov.cn/eng/xwfw_665399/s2510_665401/2511_665403/{num[1:]}'
    print(eachday_url)
    # fetchUrl performs the GET itself; the original code additionally
    # called requests.get + etree.HTML here, downloading every article
    # twice and never using the parse result.
    html = fetchUrl(eachday_url)
    content = getContent(html)
    filename = '外交部发言英文'
    path = "D:/wjbremarksEN/"
    saveFile(content, path, filename)