python爬动态网页

最新推荐文章于 2024-08-13 17:55:51 发布

@洋辣子

最新推荐文章于 2024-08-13 17:55:51 发布

阅读量144

点赞数

文章标签： python爬虫

本文链接：https://blog.csdn.net/qq_33360393/article/details/90765932

版权

selenium+chrome抓动态网页

抓取 https://m.maigoo.com/brand/search/?brandlevel=2723 页面中的商标名字

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from selenium.webdriver.chrome.options import Options
import time

# Configure Chrome to run headless (no visible window) with the GPU disabled.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Pass the options through `options=`; the older `chrome_options=` keyword is
# deprecated and is rejected by current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)

# Shared explicit wait bound to the browser, with a 10-second timeout.
wait = WebDriverWait(browser, 10)


# Example of a listing URL with more filters applied:
# https://m.maigoo.com/brand/search/?&catid=7&brandlevel=2729&initial=A&areaid=4639
# Keep clicking the "load more" button, then harvest the loaded page.
def search(url='https://m.maigoo.com/brand/search/?brandlevel=2723'):
    """Open *url*, repeatedly click "load more", then extract the results.

    The text after the last ``=`` in *url* is used as a label for the
    progress output. Clicking stops when the "load more" button does not
    become clickable within the wait timeout, when the click counter
    exceeds 100, or when any other error occurs. Whatever has been loaded
    by then is always handed to ``get_product``.

    Args:
        url: Listing page to load in the shared ``browser``.
    """
    label = url.split('=')[-1]
    count = 0
    try:
        browser.get(url)
        while True:
            # Hard cap so a page that never runs out cannot loop forever.
            if count > 100:
                break
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#result > div.morebox2 > a.morebtn.jiamore'))
            )
            # Short pause before clicking; presumably gives the page time
            # to settle after the previous load.
            time.sleep(0.5)
            count += 1
            submit.click()
    except TimeoutException:
        # The button never became clickable: treated as "nothing more to load".
        pass
    except Exception as exc:
        # Best effort: report the problem instead of hiding it, and still
        # harvest what was loaded so far (see ``finally``).
        print('search: stopped loading more after %d clicks: %r' % (count, exc))
    finally:
        get_product(label, count)


def get_product(i, count):
    """Extract brand names from the current page and append them to a file.

    Parses the shared ``browser``'s page source, reads the text of
    ``.ttl .scont`` inside every ``#result .load_block`` element, prints
    it, and appends each space-separated, non-empty token on its own line
    to ``result/content.txt`` (UTF-8).

    Args:
        i: Label printed in front of every item (the caller passes the
            trailing query value of the URL).
        count: Number of "load more" clicks performed, printed for reference.
    """
    import os  # local import so this block does not rely on a file-level import

    # Grab the rendered page source and parse it.
    html = browser.page_source
    doc = pq(html)
    # All result blocks on the page.
    items = doc('#result .load_block').items()
    # Opening in append mode does not create missing parent directories.
    os.makedirs('result', exist_ok=True)
    # Write the brand names.
    with open('result/content.txt', 'a', encoding='utf-8') as f:
        for item in items:
            product = item.find('.ttl .scont').text()
            print('%s %d %s' % (i, count, product))
            # A separate loop name keeps the label parameter ``i`` intact;
            # reusing ``i`` here made later items print a brand token as label.
            for name in product.split(' '):
                if name == '':
                    continue
                f.write(name + '\n')


if __name__ == '__main__':
    try:
        search()
    finally:
        # Always shut the driver down so the headless Chrome process does
        # not keep running after the script ends.
        browser.quit()