from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import copy
import re
import pandas as pd
import openpyxl
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import numpy as np
# NOTE: stray runtime log pasted into the source — kept as a comment:
# "There was an error managing chromedriver (error sending request for url
# (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json));
# using driver found in the cache"
# Bing
# --- Crawler section ---
# deal() receives one result-card WebElement and extracts its information.
def deal(item):
    """Extract the fields of one Zhihu result card.

    Scrolls the card into view, clicks its "read more" button, then reads
    title, author, content, timestamp and upvote count. Each field falls back
    to a default when the expected element is missing. Finally collapses the
    card again so the page layout stays stable for the next card.

    Relies on the module-level Selenium ``driver``.

    :param item: a ``List-item`` WebElement from the search-results page.
    :returns: dict with keys ``title``, ``author``, ``content``, ``time``,
        ``up_count``.
    :raises selenium.common.exceptions.WebDriverException: if the "more" or
        "collapse" buttons cannot be found/clicked.
    """
    tmp = {}
    # Scroll so the card sits below the sticky header before clicking "more".
    position = str(item.location['y'] - item.size['height'])
    driver.execute_script("window.scrollTo(0," + position + ")")
    more_button = item.find_element(
        by=By.CLASS_NAME,
        value='Button.ContentItem-more.FEfUrdfMIKpQDJDqkjte.Button--plain.fEPKGkUK5jyc4fUuT0QP')
    more_button.click()
    time.sleep(3)  # wait for the full content to render

    try:
        tmp['title'] = item.find_element(by=By.CLASS_NAME, value='ContentItem-title').text
    except Exception:
        # Fallback: first line of the card's raw text.
        tmp['title'] = str(item.text).split('\n')[0]

    try:
        tmp['author'] = item.find_element(by=By.CLASS_NAME, value='UserLink.AuthorInfo-name').text
    except Exception:
        tmp['author'] = None

    try:
        tmp['content'] = re.sub(
            r'\s+', '',
            str(item.find_element(by=By.CLASS_NAME, value='RichContent-inner').text))
    except Exception:
        # Fallback: everything after the first line of the raw card text.
        tmp['content'] = re.sub(r'\s+', '', str(''.join(str(item.text).split('\n')[1:])))

    try:
        # Strip the "发布于" (published at) / "编辑于" (edited at) prefixes.
        tmp['time'] = str(item.find_element(by=By.CLASS_NAME,
                                            value='ContentItem-time').text).replace('发布于', '')
        tmp['time'] = tmp['time'].replace('编辑于', '')
    except Exception:
        tmp['time'] = '2024-01-01 00:00'

    try:
        tmp['up_count'] = int(re.search(
            r'\d+',
            str(item.find_element(
                by=By.CLASS_NAME,
                value='Button.VoteButton.VoteButton--up.FEfUrdfMIKpQDJDqkjte').text)).group())
    except Exception:
        tmp['up_count'] = 0

    # Scroll back to the card top and collapse it again.
    down_position = str(item.location['y'])
    driver.execute_script("window.scrollTo(0," + down_position + ")")
    time.sleep(1)
    less_button = item.find_element(by=By.CLASS_NAME, value='RichContent-collapsedText')
    less_button.click()
    return tmp
# --- Pipeline ---
# Pass in a string with the search query to enter.
# Running this function saves every scraped result into the spreadsheet file.
def deal_search(content: str):
    """Run one search query and append all scraped results to ``result.xlsx``.

    Types ``content`` into the site's search box, submits, scrolls to load
    results, scrapes each card via :func:`deal`, concatenates the new rows
    onto the existing spreadsheet, and finally clears the search box.

    Relies on the module-level Selenium ``driver``.

    :param content: the search query string.
    """
    df_ori = pd.read_excel('result.xlsx')
    # Drop the first column (the index column saved by a previous to_excel).
    df_ori = df_ori[df_ori.columns[1:]]
    articles = []

    search_input = driver.find_element(
        by=By.XPATH,
        value="/html/body/div[1]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/input")
    search_button = driver.find_element(
        by=By.XPATH,
        value="/html/body/div[1]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button")
    search_input.send_keys(content)
    search_button.click()
    time.sleep(5)  # wait for the results page to load

    # Click a neutral element so keyboard focus leaves the search box.
    root = driver.find_element(by=By.ID, value='root')
    root.click()

    # Scroll to the bottom repeatedly to trigger lazy loading of more results.
    for _ in range(25):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(0.5)
    time.sleep(3)

    re_list = driver.find_elements(by=By.CLASS_NAME, value="List-item")
    for item in re_list:
        try:
            articles.append(deal(item))
        except Exception:
            # Best-effort scraping: skip cards that fail to parse.
            continue

    df = pd.DataFrame(articles)
    df = pd.concat([df_ori, df], axis=0)
    df.to_excel('result.xlsx')

    # Clear the search box one character at a time for the next query.
    search_input = driver.find_element(
        by=By.XPATH,
        value="/html/body/div[1]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/input")
    for _ in range(len(content)):
        search_input.send_keys(Keys.BACK_SPACE)
# Run every query in `contents` (defined elsewhere, e.g. in an earlier
# notebook cell), then deduplicate and split the scraped rows.
for ind in range(len(contents)):
    try:
        deal_search(contents[ind])
    except Exception:
        print(contents[ind] + " failed")
        # NOTE(review): the original did `ind -= 1` here to retry the failed
        # query, but decrementing a for-loop variable has no effect in
        # Python — each query is attempted exactly once. Recover the page
        # state and move on.
        driver.refresh()
        time.sleep(10)
        search_input = driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/input")
        for _ in range(10):
            search_input.send_keys(Keys.BACK_SPACE)
        time.sleep(5)

# Post-processing: deduplicate and split answers (titles with a question
# mark, full-width or ASCII) from articles.
df = pd.read_excel('./result.xlsx')
df = df[df.columns[1:]]  # drop the saved index column
bools = df.duplicated(subset=None, keep='first')
df_unique = df[~bools].reset_index(drop=True)
question = [("?" in title) or ("?" in title) for title in df_unique['title']]
df_articles = df_unique[[not q for q in question]].reset_index(drop=True)
df_answers = df_unique[question].reset_index(drop=True)
df_answers.to_excel("./answers.xlsx")
df_articles.to_excel("./articles.xlsx")
df_unique.to_excel('result_unique.xlsx')
print(bools)