# 热点新闻 (Hot news)
# 用 selenium 控制浏览器,在网页 HTML 中查找各标签 (drive the browser with selenium and locate the tags in the page HTML)
from selenium import webdriver
import time
from bs4 import BeautifulSoup
# Scrape the headline list that follows the first "jx-tit" heading on the
# Tencent News front page and print it as "index , title , url" rows.
#
# NOTE(review): `executable_path` is the older way of pointing Selenium at
# chromedriver; newer Selenium releases expect a Service object instead --
# confirm against the installed Selenium version before changing it.
browser = webdriver.Chrome(executable_path='/home/yx/Documents/DW/spider/env/chromedriver')
try:
    browser.get('https://news.qq.com')
    # Scroll down in 200px steps with a short pause between steps
    # (presumably so that entries loaded on scroll end up in the DOM).
    for scroll_step in range(1, 100):
        time.sleep(0.5)
        browser.execute_script("window.scrollTo(window.scrollX, %d);" % (scroll_step * 200))
    html = browser.page_source
finally:
    # Only the captured HTML is needed from here on; always shut the browser
    # down so Chrome/chromedriver do not outlive the script, even on errors.
    browser.quit()

bsObj = BeautifulSoup(html, 'lxml')
# The entries are the <li> elements inside the sibling that follows the
# first "jx-tit" div.
jx = bsObj.find_all("div", {"class": "jx-tit"})[0].find_next_sibling().find_all("li")
print("index", ",", "title", ",", "url")
for i, item in enumerate(jx):
    # Title: prefer the first image's alt text; fall back to the text of the
    # "lazyload-placeholder" div, and to an empty title if neither exists.
    try:
        text = item.find_all("img")[0]["alt"]
    except (IndexError, KeyError):
        placeholders = item.find_all("div", {"class": "lazyload-placeholder"})
        text = placeholders[0].text if placeholders else ""
    # URL: href of the first link. When it is missing, dump the element for
    # inspection and report an empty url instead of reusing the previous
    # item's value (or failing with NameError on the first item).
    try:
        url = item.find_all("a")[0]["href"]
    except (IndexError, KeyError):
        print(item)
        url = ""
    print(i + 1, ",", text, ",", url)