import csv
import os
import re
import time
from urllib import request

# Selenium browser-automation imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from lxml import html
CONTENT =[]
def page_sourc(url):
    """Open Toutiao, search for the keyword, scroll the results page,
    and hand the fully rendered HTML to sprider() for parsing.

    :param url: Toutiao homepage URL to start from.
    """
    driver_path = r'D:\Desktop\chromedriver_win32\chromedriver.exe'
    # Selenium 4 style: the driver binary goes through a Service object
    # (the executable_path= keyword was removed in Selenium 4).
    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        driver.get(url)
        # Wait for the search box to exist BEFORE interacting with it
        # (the original waited after send_keys, which races page load).
        WebDriverWait(driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//input[@class='tt-input__inner']"))
        )
        input_tag = driver.find_element(By.XPATH, "//input[@class='tt-input__inner']")
        input_tag.send_keys("旅游被坑")
        current_window = driver.current_window_handle
        sub_btn = driver.find_element(By.XPATH, "//button[@class='tt-button tt-button--default']")
        sub_btn.click()
        # The search results open in a new tab: switch to it.
        for window in driver.window_handles:
            if window != current_window:
                driver.switch_to.window(window)
        # Wait until at least one result title link is present.
        WebDriverWait(driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='sections']//div[@class='title-box']/a"))
        )
        time.sleep(2)
        # Scroll to the bottom repeatedly so lazily loaded results render.
        for _ in range(10):
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
                " var lenOfPage=document.body.scrollHeight; return lenOfPage;")
            time.sleep(3)
        sprider(driver.page_source)
    finally:
        # Always release the browser, even if a locator times out.
        driver.quit()
def sprider(source):
    """Parse the rendered search-result HTML and append one
    {'标题': title, '网址': url} dict per article to the global CONTENT list.

    :param source: page HTML string produced by page_sourc().
    """
    etree = html.etree
    tree = etree.HTML(source)
    hrefs = tree.xpath("//div[@class='sections']//div[@class='title-box']/a/@href")
    # hrefs are site-relative; prefix the host to get absolute URLs.
    links = ['https://www.toutiao.com' + href for href in hrefs]
    raw_titles = re.findall(r'<span class="J_title".*?>(.*?)</span>', source, re.DOTALL)
    # Titles may contain inline markup (e.g. <em> highlights); strip the tags.
    titles = [re.sub(r'<.*?>', '', title).strip() for title in raw_titles]
    # NOTE: the original rebound the names `links`/`contonts` inside this
    # loop, shadowing the lists they iterated over; use distinct names.
    for link, title in zip(links, titles):
        CONTENT.append({
            '标题': title,
            '网址': link,
        })
def wrte_csv(CONTENT, path='D://旅游被坑.csv'):
    """Append the scraped records to a CSV file.

    :param CONTENT: list of dicts with keys '标题' and '网址'.
    :param path: output CSV path (defaults to the original hard-coded
                 location, so existing callers are unaffected).
    """
    headers = ['标题', '网址']
    # The file is opened in append mode, so only write the header row when
    # the file is new/empty — the original duplicated it on every call.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    # utf-8-sig so Excel on Windows renders the Chinese text correctly.
    with open(path, 'a', newline='', encoding='utf-8-sig') as fp:
        writer = csv.DictWriter(fp, headers)
        if write_header:
            writer.writeheader()
        writer.writerows(CONTENT)
def main():
    """Entry point: scrape Toutiao search results and dump them to CSV."""
    start_url = 'https://www.toutiao.com/'
    page_sourc(start_url)
    wrte_csv(CONTENT)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Notes:
# 1. Toutiao itself aggregates content from other sources, so fully
#    de-duplicating the scraped results is not feasible.
# 2. The results are saved to a CSV file on the D: drive.