# selenium+chrome抓动态网页
# 抓取https://m.maigoo.com/brand/search/?brandlevel=2723页面中的商标名字
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from selenium.webdriver.chrome.options import Options
import time
# Configure the single headless Chrome session shared by the functions below.
chrome_options = Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
chrome_options.add_argument('--disable-gpu')
# Pass the options through `options=`: the older `chrome_options=` keyword is
# deprecated and is rejected by current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
# Explicit waits on `browser` time out after 10 seconds.
wait = WebDriverWait(browser, 10)
# https://m.maigoo.com/brand/search/?&catid=7&brandlevel=2729&initial=A&areaid=4639
# 不断点击 '加载更多'
def search(url='https://m.maigoo.com/brand/search/?brandlevel=2723', max_count=100):
    """Load *url*, keep clicking the "load more" button, then save the results.

    The button is clicked until it stops becoming clickable within the wait
    timeout, an error occurs, or the click counter exceeds *max_count*.
    Whatever happens, ``get_product`` is called afterwards so that the brands
    loaded so far are still written out.

    Args:
        url: Search page to open. The text after its last ``=`` is used as
            the label passed to ``get_product``.
        max_count: Clicking stops once the click counter is greater than this
            value (the default reproduces the previous hard-coded limit).
    """
    import sys  # local import: only needed for the error report below

    # Label for the output lines: whatever follows the last '=' in the URL.
    i = url.split('=')[-1]
    count = 0
    try:
        browser.get(url)
        while count <= max_count:
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#result > div.morebox2 > a.morebtn.jiamore')
                )
            )
            # Short pause before each click.
            time.sleep(0.5)
            count += 1
            submit.click()
    except TimeoutException:
        # The button did not become clickable in time -- presumably every
        # result has been loaded. This is the normal way out of the loop.
        pass
    except Exception as exc:
        # Stay best-effort (the results are still saved below), but report
        # the failure instead of hiding it. KeyboardInterrupt / SystemExit
        # are deliberately no longer swallowed.
        print('search(%s) aborted at count=%d: %r' % (url, count, exc),
              file=sys.stderr)
    finally:
        get_product(i, count)
def get_product(i, count):
    """Extract the brand names from the current page and append them to a file.

    Parses ``browser.page_source``, reads the ``.ttl .scont`` text of every
    ``#result .load_block`` element, prints one progress line per element and
    appends each space-separated name to ``result/content.txt`` (one per line).

    Args:
        i: Label printed at the start of every progress line.
        count: Click counter printed on every progress line.
    """
    import os  # local import: only needed to create the output directory

    # Grab the rendered page source and parse it.
    html = browser.page_source
    doc = pq(html)
    # All brand entries currently loaded on the page.
    items = doc('#result .load_block').items()
    # Opening the file in append mode fails if the directory is missing.
    os.makedirs('result', exist_ok=True)
    with open('result/content.txt', 'a', encoding='utf-8') as f:
        for item in items:
            product = item.find('.ttl .scont').text()
            print('%s %d %s' % (i, count, product))
            # Use a separate loop variable: reusing `i` here overwrote the
            # label, so later progress lines printed a brand name instead.
            for name in product.split(' '):
                if name:
                    f.write(name + '\n')
if __name__ == '__main__':
    try:
        search()
    finally:
        # Always shut the browser session down so that no headless Chrome /
        # chromedriver process is left running after the script ends.
        browser.quit()