# coding: utf-8
# 0、导入所需要的包
import requests
from lxml import etree
import time
# 1. Parameter configuration (interactive — prompts the user at import time)
# 1.1 Target URL to crawl; "%s" is appended so the page number can be
#     substituted in later (user enters the URL without its page suffix).
origin_url = str(input("请输入爬取的目标网站(去除页码后缀):")) + "%s"
# 1.2 Build the list of page numbers to crawl; +1 makes the last page inclusive.
first_page = int(input("请输入爬取的第一页:"))
last_page = int(input("请输入爬取的最后一页:")) + 1
pages = [x for x in range(first_page, last_page)]
# 1.3 XPath expressions used to parse the listing page and each article page.
xpath_dict = {
    "titles_xpath": input("请输入标题的xpath解析方法:"),
    "times_xpath": input("请输入时间的xpath解析方法:"),
    "urls_xpath": input("请输入URL的xpath解析方法:"),
    "contents_xpath": input("请输入网页内容的xpath解析方法:"),
    "sources_xpath": input("请输入新闻来源的xpath解析方法:")
}
# 2. Fetch a page and return its decoded HTML text.
def Request(url, timeout=10):
    """Download *url* and return the response body as UTF-8 text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. New
            parameter (default 10) so a dead server cannot hang the whole
            crawl forever; existing callers are unaffected.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout (callers in ParsePage already catch per-URL errors).
    """
    headers = {
        # Pretend to be a desktop Chrome browser so simple bot filters pass us.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8: the target sites are Chinese and may mis-declare charset.
    response.encoding = 'utf-8'
    return response.text
# 3. Parse the listing page and every linked article page.
def _clean(fragments):
    """Join an xpath result list into one string and scrub it: drop spaces,
    CR/LF, and swap English commas for Chinese ones (protects the CSV
    delimiter used by SaveFiles)."""
    return ("".join(fragments)
            .replace(",", ",")
            .replace(" ", "")
            .replace("\r", "")
            .replace("\n", "")
            .strip())


def ParsePage(text, xpath_dict):
    """Extract titles, times, URLs, contents and sources from a listing page.

    Args:
        text: HTML of the listing page.
        xpath_dict: Mapping with keys "titles_xpath", "times_xpath",
            "urls_xpath", "contents_xpath", "sources_xpath".

    Returns:
        A list of five parallel lists: [titles, times, contents, urls,
        sources]. NOTE(review): when an article page fails to download,
        contents/sources end up shorter than titles/urls — consumers must
        pair columns defensively (SaveFiles truncates to the shortest).
    """
    html = etree.HTML(text)
    # 3.1-3.3 Columns taken straight from the listing page.
    titles = html.xpath(xpath_dict["titles_xpath"])
    times = html.xpath(xpath_dict["times_xpath"])
    urls = html.xpath(xpath_dict["urls_xpath"])
    # 3.4 Visit each article URL for its body text and source.
    contents = []
    sources = []
    for url in urls:
        try:
            new_html = etree.HTML(Request(url))
            contents.append(_clean(new_html.xpath(xpath_dict["contents_xpath"])))
            sources.append(_clean(new_html.xpath(xpath_dict["sources_xpath"])))
        # Was a bare `except:` — that also swallowed KeyboardInterrupt and
        # SystemExit, making the crawl impossible to stop with Ctrl-C.
        except Exception:
            print("%s解析失败!" % url)
            continue
    # 3.5 Bundle everything into one list for SaveFiles.
    return [titles, times, contents, urls, sources]
# 4. Save one page's crawl results to a CSV file.
def SaveFiles(datas, page):
    """Write the parallel lists in *datas* to '第<page>页内容.csv' (UTF-8).

    Args:
        datas: [titles, times, contents, urls, sources] — five parallel
            lists as returned by ParsePage.
        page: Page number; used only in the output filename.

    Rows are comma-separated with one header line, matching the original
    output byte-for-byte. zip() truncates to the shortest column, so a
    failed article download (which leaves contents/sources short) no longer
    raises IndexError as the index-based loop did.
    """
    filename = '第%s页内容.csv' % page
    header = ["标题", "时间", "内容", "URL", "来源"]
    with open(filename, "w", encoding="utf-8") as fp:
        fp.write(",".join(header) + "\n")
        # Column order in datas matches the header order above.
        for row in zip(*datas):
            fp.write(",".join(str(field) for field in row) + "\n")
    # No explicit fp.close(): the with-statement already closed the file
    # (the original called close() redundantly after the with block).
# 5. Main driver: crawl, parse and save every configured page.
def Main():
    """Walk the configured page numbers, saving each page's results to CSV."""
    for page_no in pages:
        page_url = origin_url % page_no
        parsed = ParsePage(Request(page_url), xpath_dict)
        SaveFiles(parsed, page_no)
        print("第%s页解析完毕!" % page_no)
        # Be polite to the server: pause one second between pages.
        time.sleep(1)
    print("全部页面解析完成!")
# 6. Script entry point: run the crawl only when executed directly.
if __name__ == '__main__':
    Main()
# Source note (blog-scrape residue, kept as comments so the file stays valid
# Python): 平安实习—requests库练习(爬取网站信息)
# 最新推荐文章于 2021-12-12 15:16:08 发布