import requests
from tqdm import tqdm
from lxml import etree
import pandas as pd
import json
# Initialise the output file and the accumulator frame.
# The header row must use the SAME column names the scraping loop produces
# ('新闻标题', '新闻网址', ...); the original wrote an English header
# ("title,href,...") that never matched the appended data.
alldata = pd.DataFrame(columns=["新闻标题", "新闻网址", "新闻内容", "年月", "天"])
alldata.to_csv("data.csv", mode='w', encoding="gbk", index=False)  # write header only
alldata = pd.DataFrame()  # empty frame; the loop below concatenates every page into it
# Crawl 100 pages of the company-news listing and accumulate them in `alldata`.
for i in tqdm(range(1, 101)):
    # 1. Request page i. A timeout is set so a stalled server cannot hang
    #    the whole crawl; requests.get without one waits forever.
    url = f'http://www.tipdm.com/gsxw/index_{i}.jhtml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78'}
    response = requests.get(url, headers=headers, timeout=30)

    # 2. Parse the HTML response.
    dom = etree.HTML(response.text)

    # 3. Extract per-article fields: title, link, summary, and the date
    #    (split by the site into a year-month span and a day span).
    title = dom.xpath('//*[@class="con"]/h1/a/text()')
    href = dom.xpath('//*[@class="con"]/h1/a/@href')
    content = dom.xpath('//*[@class="des"]/text()')
    date = dom.xpath('//*[@id="t251"]/div/div[1]/span[2]/text()')
    day = dom.xpath('//*[@id="t251"]/div/div[1]/span[1]/text()')

    # HACK: page 57 has an article with an empty summary, so the content
    # list comes back one element short; pad it to keep the columns aligned.
    if i == 57:
        content.insert(2, '')  # insert an empty value as the third entry

    # 4. Store: build a per-page DataFrame and prepend it to the accumulator.
    #    NOTE(review): prepending ([data, alldata]) leaves the final frame in
    #    reverse page order (page 100 first) — confirm this is intended.
    data = pd.DataFrame({'新闻标题': title,
                         '新闻网址': href,
                         '新闻内容': content,
                         '年月': date,
                         '天': day
                         })
    alldata = pd.concat([data, alldata], axis=0, ignore_index=True)
# Scraper script: fetches news items (title, link, summary, date) from the
# target site across multiple pages.
# (Pasted blog metadata: "最新推荐文章于 2024-09-11 10:08:47 发布" —
#  latest recommended article published 2024-09-11 10:08:47.)