# coding: utf-8
# 0、导入所需要的包
import requests
from lxml import etree
import time
# 1. Parameter configuration (interactive — prompts the user at import time)
# 1.1 Target URL to crawl; "%s" is appended so the page number can be
#     substituted in later (user enters the URL without its page suffix).
origin_url = str(input("请输入爬取的目标网站(去除页码后缀):")) + "%s"
# 1.2 Build the list of page numbers to crawl; +1 makes the last page inclusive.
first_page = int(input("请输入爬取的第一页:"))
last_page = int(input("请输入爬取的最后一页:")) + 1
pages = [x for x in range(first_page, last_page)]
# 1.3 XPath expressions used to parse the listing page and each article page.
xpath_dict = {
    "titles_xpath": input("请输入标题的xpath解析方法:"),
    "times_xpath": input("请输入时间的xpath解析方法:"),
    "urls_xpath": input("请输入URL的xpath解析方法:"),
    "contents_xpath": input("请输入网页内容的xpath解析方法:"),
    "sources_xpath": input("请输入新闻来源的xpath解析方法:")
}
# 2. Fetch a page and return its decoded HTML text.
def Request(url, timeout=10):
    """Download *url* and return the response body as UTF-8 text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. New
            parameter (default 10) so a dead server cannot hang the whole
            crawl forever; existing callers are unaffected.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout (callers in ParsePage already catch per-URL errors).
    """
    headers = {
        # Pretend to be a desktop Chrome browser so simple bot filters pass us.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8: the target sites are Chinese and may mis-declare charset.
    response.encoding = 'utf-8'
    return response.text
# 3. Parse the listing page and every linked article page.
def _clean(fragments):
    """Join an xpath result list into one string and scrub it: drop spaces,
    CR/LF, and swap English commas for Chinese ones (protects the CSV
    delimiter used by SaveFiles)."""
    return ("".join(fragments)
            .replace(",", ",")
            .replace(" ", "")
            .replace("\r", "")
            .replace("\n", "")
            .strip())


def ParsePage(text, xpath_dict):
    """Extract titles, times, URLs, contents and sources from a listing page.

    Args:
        text: HTML of the listing page.
        xpath_dict: Mapping with keys "titles_xpath", "times_xpath",
            "urls_xpath", "contents_xpath", "sources_xpath".

    Returns:
        A list of five parallel lists: [titles, times, contents, urls,
        sources]. NOTE(review): when an article page fails to download,
        contents/sources end up shorter than titles/urls — consumers must
        pair columns defensively (SaveFiles truncates to the shortest).
    """
    html = etree.HTML(text)
    # 3.1-3.3 Columns taken straight from the listing page.
    titles = html.xpath(xpath_dict["titles_xpath"])
    times = html.xpath(xpath_dict["times_xpath"])
    urls = html.xpath(xpath_dict["urls_xpath"])
    # 3.4 Visit each article URL for its body text and source.
    contents = []
    sources = []
    for url in urls:
        try:
            new_html = etree.HTML(Request(url))
            contents.append(_clean(new_html.xpath(xpath_dict["contents_xpath"])))
            sources.append(_clean(new_html.xpath(xpath_dict["sources_xpath"])))
        # Was a bare `except:` — that also swallowed KeyboardInterrupt and
        # SystemExit, making the crawl impossible to stop with Ctrl-C.
        except Exception:
            print("%s解析失败!" % url)
            continue
    # 3.5 Bundle everything into one list for SaveFiles.
    return [titles, times, contents, urls, sources]
# 4. Save one page's crawl results to a CSV file.
def SaveFiles(datas, page):
    """Write the parallel lists in *datas* to '第<page>页内容.csv' (UTF-8).

    Args:
        datas: [titles, times, contents, urls, sources] — five parallel
            lists as returned by ParsePage.
        page: Page number; used only in the output filename.

    Rows are comma-separated with one header line, matching the original
    output byte-for-byte. zip() truncates to the shortest column, so a
    failed article download (which leaves contents/sources short) no longer
    raises IndexError as the index-based loop did.
    """
    filename = '第%s页内容.csv' % page
    header = ["标题", "时间", "内容", "URL", "来源"]
    with open(filename, "w", encoding="utf-8") as fp:
        fp.write(",".join(header) + "\n")
        # Column order in datas matches the header order above.
        for row in zip(*datas):
            fp.write(",".join(str(field) for field in row) + "\n")
    # No explicit fp.close(): the with-statement already closed the file
    # (the original called close() redundantly after the with block).
# 5. Main driver: crawl, parse and save every configured page.
def Main():
    """Walk the configured page numbers, saving each page's results to CSV."""
    for page_no in pages:
        page_url = origin_url % page_no
        parsed = ParsePage(Request(page_url), xpath_dict)
        SaveFiles(parsed, page_no)
        print("第%s页解析完毕!" % page_no)
        # Be polite to the server: pause one second between pages.
        time.sleep(1)
    print("全部页面解析完成!")
# 6. Script entry point: run the crawl only when executed directly.
if __name__ == '__main__':
    Main()
# Source note (blog-scrape residue, kept as comments so the file stays valid
# Python): 平安实习—requests库练习(爬取网站信息)
# 最新推荐文章于 2021-12-12 15:16:08 发布