# Fetch the real result URLs from a Baidu search.
# Approach 1: use Selenium to drive the search page.
# Tech stack: selenium + requests + pandas.
import time
import asyncio
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import time
from selenium.webdriver.common.keys import Keys
import requests
import pandas as pd
import random
from user_agent import user_agent
# Default HTTP request headers for the redirect-resolving requests below.
# The User-Agent is drawn once, at import time, from the project-supplied
# user_agent pool so repeated runs don't all present the same browser.
headers = {
    'user-agent': random.choice(user_agent),
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,image/apng,*/*;q=0.8'),
    'Accept-Encoding': 'gzip, deflate',
}
def filter_url(url_list):
    """Resolve Baidu redirect links to their real target URLs.

    Each Baidu result link is an HTTP redirect; requesting it with
    ``allow_redirects=False`` exposes the real address in the
    ``Location`` response header.

    Parameters:
        url_list: iterable of Baidu redirect URLs.

    Returns:
        list of resolved URLs, keeping only those that start with
        ``https`` (same filtering as before).
    """
    new_list = []
    for url in url_list:
        try:
            # timeout prevents a single dead link from hanging the batch
            resp = requests.get(url=url, headers=headers,
                                allow_redirects=False, timeout=10)
        except requests.RequestException:
            # best-effort: skip unreachable links instead of crashing
            continue
        # Not every response is a redirect; .get() avoids the KeyError
        # the old resp.headers['Location'] raised on non-redirects.
        real_url = resp.headers.get('Location', '')
        if real_url.startswith('https'):
            new_list.append(real_url)
    return new_list
def get_url(html):
    """Extract result links from a parsed Baidu results page.

    Parameters:
        html: parsed document object exposing ``.xpath()``
              (presumably an lxml element — TODO confirm with caller).

    Returns:
        (header_url_list, url_list) tuple: links from the top "header"
        card and the ordinary organic result links, respectively.
    """
    organic_links = html.xpath('//*[@id]/h3/a/@href')
    header_links = html.xpath(
        '//*[@id]/div/article/section/div/div[2]/a[1]/@href'
    )
    return header_links, organic_links
def get_html(word):
chromedriver = r'D:\SoftWare\chromedriver.exe'
chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
driver.get("http://www.baidu.com")
time.sleep(2)
driver.find_element_by_id('kw').send_keys(word)
driver.find_element_by_id('kw').send_keys(Keys.ENTER)
time.sleep(2)
page_htm