'''
Pages crawled: 22-31, 7815 records in total.
'''
import requests,time
from lxml import etree
from pymysql import *
class DataItem(object):
    """Crawl one MOFCOM listing page and store each linked article in MySQL.

    Constructing an instance performs the whole crawl for ``url``: the listing
    page is downloaded, every article it links to is fetched and parsed, and
    one row per article (plus one row per "related" link found on the article
    page) is inserted into the ``dataitem`` table.

    The instance attributes double as the "current row" buffer: the parsing
    helpers overwrite ``listname``, ``content``, ``p_time`` and ``link`` for
    each page, and ``write_mysql`` inserts whatever they currently hold.

    Args:
        theme: Category label stored with every row (e.g. 公文).
        sourcetype: Source type label stored with every row.
        sourcename: Source name label stored with every row.
        url: URL of the listing page to crawl.
    """

    # Site root that the listing page's hrefs are appended to.
    _BASE_URL = 'http://www.mofcom.gov.cn'
    # Seconds to wait for a response; without a timeout a dead related link
    # could block the crawl indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self, theme, sourcetype, sourcename, url):
        self.tree = self.request_html(url)
        self.theme = theme  # category, e.g. 机构消息, 公文
        self.sourcetype = sourcetype  # source type, e.g. 中央国家机关
        self.sourcename = sourcename  # source name, e.g. 农业部, 商务部
        self.c_time = time.strftime('%Y-%m-%d')  # crawl date
        self.listname = ''  # article title
        self.content = ''  # article body (or the image URL for image-only pages)
        self.abstract = ''  # summary; never filled in by this class
        self.p_s_time = 0  # publish timestamp; never filled in by this class
        self.p_time = ''  # publish time as given by the page's PubDate meta tag
        self.link = ''  # URL of the page the row was taken from
        self.keywords = ''  # keywords; never filled in by this class
        # parse1_html() is the older parser that does not write to the
        # database; parse2_html() is the one in use.
        self.parse2_html()

    def request_html(self, url):
        """Download ``url`` and return it parsed as an lxml HTML tree.

        The response body is decoded as UTF-8, so a page in another encoding
        raises ``UnicodeDecodeError``; network failures and timeouts raise
        the corresponding ``requests`` exception.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        return etree.HTML(response.content.decode('utf-8'))

    def _detail_urls(self):
        """Return the absolute article URLs linked from the listing page."""
        hrefs = self.tree.xpath('//ul[@class="u-newsList01 f-mt10"]//a/@href')
        return [self._BASE_URL + href for href in hrefs]

    @staticmethod
    def _body_text(tree):
        """Return the article text, trying the <div> layout then the <section> one."""
        parts = tree.xpath('//div[@id="zoom"]//p//text()')
        if not parts:
            parts = tree.xpath('//section[@id="zoom"]//p//text()')
        return ''.join(parts)

    def _fill_from_tree(self, tree, link, image_first):
        """Copy title, body, publish time and link of one page onto ``self``.

        Args:
            tree: Parsed HTML tree of the article page.
            link: URL the tree was fetched from.
            image_first: When true, a page whose body contains an image stores
                the first image's ``src`` as content instead of the text.

        Raises:
            IndexError: If the page has no <title> or no PubDate meta tag.
        """
        self.listname = tree.xpath('//head/title/text()')[0]
        images = tree.xpath('//div[@id="zoom"]//p/img/@src') if image_first else []
        if images:
            self.content = images[0]
        else:
            self.content = self._body_text(tree)
        self.p_time = tree.xpath('//head/meta[@name="PubDate"]/@content')[0]
        print(self.p_time)
        self.link = link

    # First version: parses every article but does not store anything.
    def parse1_html(self):
        """Parse every linked article into the instance fields (no DB write)."""
        for detail_url in self._detail_urls():
            tree2 = self.request_html(detail_url)
            self._fill_from_tree(tree2, detail_url, image_first=False)
            # The database write was disabled in this version.

    # Improved version: handles image-only articles and follows related links.
    def parse2_html(self):
        """Parse and store every linked article and each of its related links.

        A related link that cannot be fetched or parsed is still recorded,
        with placeholder title/content/time, because some of those links are
        known to be dead.
        """
        for detail_url in self._detail_urls():
            tree2 = self.request_html(detail_url)
            self._fill_from_tree(tree2, detail_url, image_first=True)
            self.write_mysql()

            # Follow the "related" links embedded in the article page.
            for redirect_url in tree2.xpath('//div[@class="relative"]//li/a/@href'):
                try:
                    tree3 = self.request_html(redirect_url)
                    # Parse the page that was just fetched (tree3); reading the
                    # body from tree2 would duplicate the parent article.
                    self._fill_from_tree(tree3, redirect_url, image_first=True)
                except Exception:
                    # Not a bare except: Ctrl-C / SystemExit must stop the
                    # crawl rather than be stored as a missing page.
                    self.link = redirect_url
                    self.listname = '此页面已不存在'
                    self.content = '无'
                    self.p_time = '无'
                self.write_mysql()

    def write_mysql(self):
        """Insert the current field values as one row of the ``dataitem`` table.

        A new connection is opened for every call and is always closed, even
        when the insert fails.
        """
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = connect(
            host='localhost',
            database='changwei',
            user='root',
            password='200311',
            charset='utf8',
            port=3306
        )
        try:
            cur = conn.cursor()
            try:
                sql = 'insert into dataitem (theme,sourcetype,sourcename,c_time,listname,content,abstract,p_s_time,p_time,link,keywords) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
                lis = [self.theme, self.sourcetype, self.sourcename, self.c_time, self.listname, self.content, self.abstract,
                       self.p_s_time, self.p_time, self.link, self.keywords]
                cur.execute(sql, lis)
                conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()
if __name__ == '__main__':
    # Crawl log, as recorded by the loops that used to be commented out here.
    # Every listing URL has the form
    #   http://www.mofcom.gov.cn/article/<path>/?<page>
    # and every run used DataItem('公文', '中央国家机关', '商务部', url).
    #
    #   <path>                                   pages   section
    #   jhguihua/redianzhuizong                  2-5     personnel work updates (done)
    #   cwgongzuo/huiybz                         1-3     policy releases (done)
    #   cwgongzuo/jingmaoluntan/zhaobiao         8-44    tender / invitation-to-bid notices (done)
    #   cwgongzuo/jingmaoluntan/dyly             1-2     single-source notices
    #   cwgongzuo/jingmaoluntan/jzxtp            1-7     competitive-negotiation notices
    #   cwgongzuo/jingmaoluntan/ysgg             1-2     pre-qualification notices
    #   cwgongzuo/jingmaoluntan/gengzheng        1-22    correction notices
    #   cwgongzuo/jingmaoluntan/zhongbiao        23-28   bid-award notices
    #   cwgongzuo/jingmaoluntan/feibiao          1-11    failed / cancelled tender notices
    #
    # Current run: result notices, listing pages 1-21.
    listing_template = 'http://www.mofcom.gov.cn/article/cwgongzuo/jingmaoluntan/jieguo/?{}'
    for page_no in range(1, 22):
        # Short pause before each listing page is requested.
        time.sleep(0.5)
        listing_url = listing_template.format(page_no)
        DataItem('公文', '中央国家机关', '商务部', listing_url)
# Crawler - Ministry of Commerce (MOFCOM) news
# (Blog page footer: latest recommended article published 2021-02-26 20:48:18)