# selenium + xpath 爬取 CSDN Python 频道博文的博主信息

import json
import os
import time

import requests
from lxml import etree
from selenium.webdriver import Chrome

class CSDN_Spider():
    """Scrape author-profile information for Python articles on CSDN.

    Workflow: drive a Chrome browser to scroll the CSDN Python channel and
    collect article URLs, fetch each article page with ``requests``, parse
    the author-profile sidebar with XPath, and append each record as one
    JSON line to ``./data/csdn_authors.json``.
    """

    def __init__(self):
        # Landing page of CSDN's Python channel (content loads on scroll).
        self.url = "https://www.csdn.net/nav/python"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        # NOTE(review): `executable_path` is deprecated in Selenium 4;
        # this script appears to target Selenium 3.x — confirm before upgrading.
        self.browser = Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")

    def get_all_articles_url(self):
        """Scroll the channel page 100 times and collect article links.

        Returns:
            list[str]: de-duplicated article URLs found on the page.
        """
        items = []
        self.browser.get(self.url)
        for _ in range(100):
            html = etree.HTML(self.browser.page_source)
            # Each article card exposes its link under div.title.
            items += html.xpath("//div[@class='title']//a/@href")
            # Jump to the bottom so the infinite scroll loads more cards.
            js = "var q=document.documentElement.scrollTop=100000"
            self.browser.execute_script(js)
            time.sleep(1)  # give lazy-loaded content time to arrive
        # Fix: quit() (not close()) also terminates the chromedriver
        # process, so no orphan driver is left running after the crawl.
        self.browser.quit()
        return list(set(items))

    def get_detail_page_text(self, url):
        """Fetch one article page.

        Args:
            url: article URL returned by :meth:`get_all_articles_url`.

        Returns:
            str: the page HTML on HTTP 200, otherwise the sentinel string
            ``"request not successfully"`` (kept for caller compatibility).
        """
        try:
            # Fix: a timeout plus RequestException handling keeps one dead
            # server or dropped connection from aborting the whole crawl.
            response = requests.get(url, headers=self.headers, timeout=10)
        except requests.RequestException:
            return "request not successfully"
        if response.status_code == 200:
            return response.text
        return "request not successfully"

    def parse_detail_page(self, text):
        """Extract the author-profile fields from an article page.

        Args:
            text: HTML of an article detail page.

        Returns:
            dict: author fields; on a page missing the expected profile
            markup, ``{"name": "invalidity"}`` (plus any fields parsed
            before the failure).
        """
        html = etree.HTML(text)
        item = {}
        # Fixed field order of the profile-statistics block; note the
        # typo fix: "vivstor_num" -> "visitor_num" in the output schema.
        stat_keys = ("original", "week_rank", "all_rank", "visitor_num",
                     "integral", "fans_num", "be_praised_num",
                     "review_num", "collection")
        try:
            item["name"] = html.xpath("//a[@class='follow-nickName ']/text()")[0]
            item["code_age"] = html.xpath("//span[@class='personal-home-page personal-home-years']/text()")[0]
            item["authentication"] = html.xpath("//a[@class='personal-home-certification']/@title")[0]
            digital_data = html.xpath("//dl[@class='text-center']//span[@class='count']/text()")
            for i, key in enumerate(stat_keys):
                item[key] = digital_data[i]
        # Fix: catch only the IndexError raised by a missing field instead
        # of a bare `except:` that would hide every other bug.
        except IndexError:
            print("this item is erroneous")
            item["name"] = "invalidity"
        return item

    def save_data(self, item):
        """Append one author record as a JSON line (UTF-8, one per line)."""
        # Fix: create the output directory on first use instead of
        # crashing with FileNotFoundError when ./data is absent.
        os.makedirs("./data", exist_ok=True)
        with open("./data/csdn_authors.json", "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")

    def main(self):
        """Run the full crawl: collect URLs, then fetch/parse/save each."""
        articles_urls = self.get_all_articles_url()
        for url in articles_urls:
            text = self.get_detail_page_text(url)
            if text != "request not successfully":
                item = self.parse_detail_page(text)
                self.save_data(item)
                print(item["name"] + "save into local document of json successfully")


if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    CSDN_Spider().main()

爬取结果如下:

(此处原文为爬取结果的截图,图片未能随文本导出。)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值