Ping An Internship: requests Library Practice (Scraping Website Information)

# coding: utf-8


# 0. Import the required packages
import requests
from lxml import etree
import time


# 1. Parameter configuration
# 1.1 Specify the target URL to crawl
origin_url = input("Enter the target site URL (without the page-number suffix): ") + "%s"

# 1.2 Generate the page numbers to crawl
first_page = int(input("Enter the first page to crawl: "))
last_page = int(input("Enter the last page to crawl: ")) + 1
pages = list(range(first_page, last_page))

# 1.3 Configure the XPath expressions used to parse the pages
xpath_dict = {
    "titles_xpath": input("Enter the XPath expression for the titles: "),
    "times_xpath": input("Enter the XPath expression for the timestamps: "),
    "urls_xpath": input("Enter the XPath expression for the URLs: "),
    "contents_xpath": input("Enter the XPath expression for the article content: "),
    "sources_xpath": input("Enter the XPath expression for the news source: ")
}
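
# Purely illustrative example of what the interactive input might look like for a
# typical news-list page. The site and every XPath expression below are hypothetical
# placeholders, not values from the original exercise:
#
#   Target site (without the page-number suffix): https://example.com/news/page
#   First page: 1        Last page: 3
#   titles_xpath:   //ul[@class="news-list"]/li/a/text()
#   times_xpath:    //ul[@class="news-list"]/li/span[@class="date"]/text()
#   urls_xpath:     //ul[@class="news-list"]/li/a/@href
#   contents_xpath: //div[@class="article-body"]//p/text()
#   sources_xpath:  //span[@class="source"]/text()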



# 2. Request the page and return its HTML text
def Request(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
    }
    # A timeout keeps the crawler from hanging on an unresponsive server
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    return response.text
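

# An optional, purely illustrative variant of Request() that adds automatic retries
# through requests.Session. This is a sketch for demonstration only; the name
# RequestWithRetries and the retry settings are assumptions, not part of the exercise.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def RequestWithRetries(url):
    session = requests.Session()
    # Retry up to 3 times on common transient server errors, with a short backoff
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    headers = {"User-Agent": "Mozilla/5.0"}
    response = session.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    return response.text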


# 3. Parse the page
def ParsePage(text, xpath_dict):
    html = etree.HTML(text)
    # 3.1 Extract the titles as a list
    titles = html.xpath(xpath_dict["titles_xpath"])
    # 3.2 Extract the timestamps as a list
    times = html.xpath(xpath_dict["times_xpath"])
    # 3.3 Extract the article URLs as a list
    urls = html.xpath(xpath_dict["urls_xpath"])

    # 3.4 Visit every article URL and extract its content and source
    contents = []
    sources = []
    for url in urls:
        try:
            new_text = Request(url)
            new_html = etree.HTML(new_text)
            content = new_html.xpath(xpath_dict["contents_xpath"])
            # Clean the content: replace ASCII commas (they would break the CSV columns)
            # with full-width ones, and strip spaces and line breaks
            new_content = ''.join(content).replace(",", ",").replace(" ", "")\
                .replace("\r", "").replace("\n", "").strip()
            contents.append(new_content)
            source = new_html.xpath(xpath_dict["sources_xpath"])
            # Clean the source the same way
            new_source = "".join(source).replace(",", ",").replace(" ", "")\
                .replace("\r", "").replace("\n", "").strip()
            sources.append(new_source)
        except Exception:
            # Append placeholders so the columns stay aligned with titles and urls
            contents.append("")
            sources.append("")
            print("Failed to parse %s!" % url)
            continue

    # 3.5 Combine all extracted fields into one list of columns and return it,
    # i.e. datas == [titles, times, contents, urls, sources], ready to be saved
    datas = [titles, times, contents, urls, sources]
    return datas


# 4. Save the information to a CSV file
def SaveFiles(datas, page):
    filename = "page_%s_contents.csv" % page
    with open(filename, "w", encoding="utf-8") as fp:
        # Write the header row
        header = ["Title", "Time", "Content", "URL", "Source"]
        fp.write(",".join(header))
        fp.write("\n")

        # Write one row per article; zip(*datas) pairs the columns row by row
        for row in zip(*datas):
            fp.write(",".join(str(x) for x in row))
            fp.write("\n")
        # No explicit fp.close() is needed: the with block closes the file automatically
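

# An optional, purely illustrative alternative to SaveFiles() that uses Python's
# built-in csv module. csv.writer quotes fields that contain commas, so the
# comma replacement done during cleaning would not be strictly necessary.
# The name SaveFilesCsv is a made-up helper for demonstration only.
import csv

def SaveFilesCsv(datas, page):
    filename = "page_%s_contents.csv" % page
    with open(filename, "w", encoding="utf-8", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(["Title", "Time", "Content", "URL", "Source"])
        # zip(*datas) yields one (title, time, content, url, source) tuple per article
        writer.writerows(zip(*datas))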


# 5. Main function
def Main():
    for page in pages:
        url = origin_url % page
        res_text = Request(url)
        res_datas = ParsePage(res_text, xpath_dict)
        SaveFiles(res_datas, page)
        print("Page %s parsed and saved!" % page)
        time.sleep(1)  # pause briefly between pages to avoid hammering the server
    print("All pages parsed!")


# 6. Call the main function
if __name__ == '__main__':
    Main()