千里马招标爬虫

满天繁星_

已于 2023-12-31 09:41:09 修改

阅读量758

点赞数 2

分类专栏：笔记　文章标签：python

于 2022-03-15 17:34:11 首次发布

本文链接：https://blog.csdn.net/weixin_51081062/article/details/123507853

版权

笔记专栏收录该内容

15 篇文章 0 订阅

订阅专栏

信息采集（30条）
http://www.qian$$$lima.com/zbgg/
1.可以实现1-5页面的灵活爬取
2.本地txt存储

import requests
from lxml import etree


class WYSpider:
    """Download one listing page, extract (title, link) pairs, and save them.

    Each instance is bound to a single page number ``p``: it fetches
    ``.../zbgg/p{p}`` and writes its results to ``p{p}.txt`` in the current
    working directory.
    """

    # Upper bound on download attempts per page, so a page that keeps
    # failing cannot retry forever.
    MAX_ATTEMPTS = 3

    # Seconds to wait for the server before giving up on one request;
    # without a timeout ``requests.get`` may block indefinitely.
    REQUEST_TIMEOUT = 10

    # Selects the second <a> inside each listing entry of the page layout.
    _LINK_XPATH = ('//*[@id="__layout"]/div/div[2]/div/div[2]/div[1]/div[1]'
                   '/div[2]/div/div/div/div/div/a[2]')

    def __init__(self, p):
        """Bind the spider to listing page number ``p``."""
        # Example URL: http://www.qian$$$lima.com/zbgg/p1
        self.host = f'http://www.qian$$$lima.com/zbgg/p{p}'
        self.file_name = f"p{p}"
        self.head = {
            # Placeholder value: replace with a real browser User-Agent.
            'User-Agent': '用自己的'
        }

    def getHtml(self):
        """Fetch the page and return its body decoded as UTF-8 text.

        The request is attempted up to ``MAX_ATTEMPTS`` times; a failure
        message is printed after every non-200 response.

        Raises:
            RuntimeError: if no attempt returned HTTP 200.
        """
        status = None
        for _ in range(self.MAX_ATTEMPTS):
            response = requests.get(
                url=self.host,
                headers=self.head,
                timeout=self.REQUEST_TIMEOUT,
            )
            status = response.status_code
            if status == 200:
                # Decode the raw bytes explicitly rather than relying on the
                # charset guessed from the response headers.
                return response.content.decode('utf-8')
            print('请求失败，请重新输入')
        raise RuntimeError(
            f'giving up on {self.host} after {self.MAX_ATTEMPTS} attempts '
            f'(last status: {status})'
        )

    def parseHtml(self, content):
        """Extract link titles and targets from the page HTML.

        Args:
            content: HTML source of a listing page.

        Returns:
            A ``zip`` iterator of ``(title, href)`` string pairs, one per
            matched anchor. A missing title or ``href`` becomes ``''``.
        """
        et = etree.HTML(content)  # build the tree queried with XPath below
        a_s = et.xpath(self._LINK_XPATH)
        print("---", a_s)
        # Title and link are read from the same element so that an anchor
        # lacking one of them cannot shift the pairing of the others.
        a = [each.text or '' for each in a_s]
        print(a)
        href = [each.get('href') or '' for each in a_s]
        print(href)
        return zip(a, href)

    def save(self, a_and_href):
        """Write each ``(title, href)`` pair as ``title : href`` on its own line.

        Args:
            a_and_href: iterable of ``(title, href)`` string pairs.

        The output file ``<file_name>.txt`` is overwritten on every call.
        """
        # The context manager closes the file even if a write fails.
        with open(self.file_name + ".txt", mode='w', encoding='utf-8') as file:
            for k, v in a_and_href:
                file.write(k + ' : ' + v + '\n')
        print('END!', file)


# Crawl listing pages 1 through 5; each page goes through the same
# fetch -> parse -> save pipeline and ends up in its own text file.
for i in range(1, 6):
    app = WYSpider(i)
    page_html = app.getHtml()
    title_link_pairs = app.parseHtml(page_html)
    app.save(title_link_pairs)