Scraping Toutiao search results by keyword with Selenium

The script below opens the Toutiao home page, types a keyword into the search box, clicks the search button, switches to the results tab that opens, scrolls ten times to trigger lazy loading, and finally writes each result's title and URL to a CSV file.

from selenium import webdriver
# locator strategies and explicit-wait helpers
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
from urllib import request
import re
import csv

CONTENT = []  # accumulated rows: one dict per scraped article

def get_page_source(url):
    driver_path = r'D:\Desktop\chromedriver_win32\chromedriver.exe'
    # Selenium 3 style startup; see the Selenium 4 note after the listing
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get(url)

    # wait until the search box has rendered before typing the keyword
    WebDriverWait(driver=driver, timeout=10).until(
        EC.presence_of_element_located((By.XPATH, "//input[@class='tt-input__inner']"))
    )
    inputTag = driver.find_element_by_xpath("//input[@class='tt-input__inner']")
    inputTag.send_keys("旅游被坑")

    # remember the current tab so the new results tab can be told apart later
    current_window = driver.current_window_handle

    subBtn = driver.find_element_by_xpath("//button[@class='tt-button tt-button--default']")
    subBtn.click()

    # the search results open in a new tab; switch the driver to it
    for window in driver.window_handles:
        if window != current_window:
            driver.switch_to.window(window)

    # wait until result links have rendered in the new tab, then give
    # the page a moment to settle
    WebDriverWait(driver=driver, timeout=10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='sections']//div[@class='title-box']/a"))
    )
    time.sleep(2)
    # scroll to the bottom repeatedly so the page lazy-loads more results
    for i in range(10):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

    # parse once after all the scrolling; parsing inside the loop would
    # collect every earlier item again on each pass
    spider(driver.page_source)
    driver.quit()


def spider(source):
    text = etree.HTML(source)

    # article links are relative; prefix the site root to make them absolute
    links = text.xpath("//div[@class='sections']//div[@class='title-box']/a/@href")
    links = ['https://www.toutiao.com' + link for link in links]

    # the title spans may contain nested tags (keyword highlighting),
    # so grab the raw span bodies and strip any leftover markup
    titles = re.findall(r'<span class="J_title".*?>(.*?)</span>', source, re.DOTALL)
    titles = [re.sub(r'<.*?>', '', title).strip() for title in titles]

    for link, title in zip(links, titles):
        CONTENT.append({
            '标题': title,  # title
            '网址': link    # URL
        })

def write_csv(rows):
    headers = ['标题', '网址']  # title, URL

    # utf-8-sig keeps the Chinese text readable when opened in Excel;
    # 'w' instead of 'a' avoids a duplicate header row on every run
    with open('D:/旅游被坑.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        writer = csv.DictWriter(fp, headers)
        writer.writeheader()
        writer.writerows(rows)



def main():
    url = 'https://www.toutiao.com/'
    get_page_source(url)
    write_csv(CONTENT)


if __name__ == '__main__':
    main()
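
The listing above uses the Selenium 3 API. Selenium 4 removed executable_path and the find_element_by_* helpers, so under a current install the driver setup and element lookups would look roughly like this instead (a minimal sketch reusing the same driver path and XPath as above):

# Selenium 4 equivalent of the setup and first lookup in the listing
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(service=Service(r'D:\Desktop\chromedriver_win32\chromedriver.exe'))
driver.get('https://www.toutiao.com/')
inputTag = driver.find_element(By.XPATH, "//input[@class='tt-input__inner']")
inputTag.send_keys("旅游被坑")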

1. Toutiao itself aggregates content scraped from other sites, so truly deduplicating the results is not possible; at best, exact repeats of the same URL can be collapsed, as in the sketch below.
2. The results are saved to a CSV file on the D drive.
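
A minimal sketch of that best-effort deduplication, keyed on the article URL (the dedupe helper is my addition and not part of the original script); reposts published under a different URL or reworded title will still get through, which is exactly the limitation note 1 describes:

def dedupe(rows):
    # keep only the first occurrence of each URL
    seen = set()
    unique = []
    for row in rows:
        if row['网址'] not in seen:
            seen.add(row['网址'])
            unique.append(row)
    return unique

With this helper in place, main() could call write_csv(dedupe(CONTENT)) instead of write_csv(CONTENT).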
