from selenium.webdriver import Chrome
from lxml import etree
import time
import requests
import json
class CSDN_Spider():
    """Crawler that collects CSDN author-profile stats from the Python nav feed.

    Workflow: scroll the listing page with Selenium to harvest article URLs,
    fetch each article with ``requests``, parse the author's profile numbers
    with lxml XPath, and append each record as one JSON line to a local file.
    """

    def __init__(self):
        # Listing page whose infinite scroll lazy-loads article links.
        self.url = "https://www.csdn.net/nav/python"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        # NOTE(review): `executable_path` is deprecated in Selenium 4 —
        # switch to Service(executable_path=...) when upgrading.
        self.browser = Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")

    def get_all_articles_url(self):
        """Scroll the nav page 100 times and return the deduplicated article URLs."""
        items = []
        self.browser.get(self.url)
        for _ in range(100):
            html = etree.HTML(self.browser.page_source)
            items += html.xpath("//div[@class='title']//a/@href")
            # Jump to the bottom so the page lazy-loads the next batch of links.
            js = "var q=document.documentElement.scrollTop=100000"
            self.browser.execute_script(js)
            time.sleep(1)
        # quit() — not close() — also shuts down the chromedriver process;
        # close() alone leaks the driver after the window is gone.
        self.browser.quit()
        return list(set(items))

    def get_detail_page_text(self, url):
        """Fetch one article page.

        Returns the HTML text on HTTP 200, or the sentinel string
        "request not successfully" on a non-200 status or network error
        (main() filters on that exact sentinel).
        """
        try:
            response = requests.get(url, headers=self.headers)
        except requests.RequestException:
            # A single flaky URL must not abort the whole crawl.
            return "request not successfully"
        if response.status_code == 200:
            return response.text
        else:
            return "request not successfully"

    def parse_detail_page(self, text):
        """Extract the author-profile fields from an article page's HTML.

        Returns a dict of profile stats; on pages lacking the expected
        markup the dict only carries name == "invalidity".
        """
        html = etree.HTML(text)
        item = {}
        try:
            item["name"] = html.xpath("//a[@class='follow-nickName ']/text()")[0]
            item["code_age"] = html.xpath("//span[@class='personal-home-page personal-home-years']/text()")[0]
            item["authentication"] = html.xpath("//a[@class='personal-home-certification']/@title")[0]
            digital_data = html.xpath("//dl[@class='text-center']//span[@class='count']/text()")
            item["original"] = digital_data[0]
            item["week_rank"] = digital_data[1]
            item["all_rank"] = digital_data[2]
            # Key kept as-is ("vivstor_num", sic) — it is part of the JSON
            # output schema already written to disk.
            item["vivstor_num"] = digital_data[3]
            item["integral"] = digital_data[4]
            item["fans_num"] = digital_data[5]
            item["be_praised_num"] = digital_data[6]
            item["review_num"] = digital_data[7]
            item["collection"] = digital_data[8]
        except IndexError:
            # Page lacks the expected profile markup (e.g. enterprise blogs);
            # a bare except here would also swallow KeyboardInterrupt.
            print("this item is erroneous")
            item["name"] = "invalidity"
        return item

    def save_data(self, item):
        """Append one author record as a JSON line (./data must already exist)."""
        with open("./data/csdn_authors.json", "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")

    def main(self):
        """Crawl article URLs, then parse and persist each author profile."""
        articles_urls = self.get_all_articles_url()
        for url in articles_urls:
            text = self.get_detail_page_text(url)
            if text != "request not successfully":
                item = self.parse_detail_page(text)
                self.save_data(item)
                print(item["name"] + "save into local document of json successfully")
# Script entry point: build the spider and run the full crawl.
if __name__ == "__main__":
    spider = CSDN_Spider()
    spider.main()
# The crawl results are as follows: