爬取百度口碑企业标签分类

爬虫测试代码,主要通过 selenium、bs4 等模块完成网页数据的抓取与解析。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime


# Every detail-page URL discovered so far, in the order it was visited.
alist = []

# XPath of the container that holds both the result rows and the pager.
_RESULTS_XPATH = '//*[@id="app"]/div/div[2]/div[3]'
# Number of result rows probed on each listing page.
_ROWS_PER_PAGE = 10
# How many times the pager is clicked after the first page has been scraped.
_PAGE_TURNS = 469
# Seconds to wait after navigation so the page can finish rendering.
_RENDER_DELAY = 2
# Seconds before a detail-page download is abandoned.
_REQUEST_TIMEOUT = 10

# (tag name, class pattern) of the fields printed from each detail page,
# in print order: company name, trade, business address.
_DETAIL_FIELDS = (
    ("span", re.compile("compname-txt")),
    ("p", re.compile("right trade")),
    ("p", re.compile("right businessaddr")),
)


def _print_company_details(url):
    """Download one detail page and print the text of its matching fields.

    Raises:
        requests.RequestException: if the download fails or times out.
    """
    html = requests.get(url, timeout=_REQUEST_TIMEOUT).content
    soup = BeautifulSoup(html, "lxml")
    for tag_name, class_pattern in _DETAIL_FIELDS:
        for node in soup.find_all(tag_name, class_=class_pattern):
            print(node.text)


def _scrape_current_page(driver, collected):
    """Visit every result row of the listing page currently shown.

    Each row's link is appended to ``collected`` and its detail page is
    printed. Scraping is best-effort: a row that is missing or whose detail
    page cannot be fetched is reported on stderr and skipped.
    """
    import sys

    from selenium.common.exceptions import WebDriverException

    for row in range(1, _ROWS_PER_PAGE + 1):
        row_xpath = "{}/div[{}]/a".format(_RESULTS_XPATH, row)
        try:
            url = driver.find_element(By.XPATH, row_xpath).get_attribute("href")
            collected.append(url)
            _print_company_details(url)
        except (WebDriverException, requests.RequestException) as exc:
            # Only browser and network errors are skipped; anything else
            # (including Ctrl-C) propagates instead of being hidden.
            print("skipping row {}: {}".format(row, exc), file=sys.stderr)


def _pager_index(turn):
    """Return the 1-based ``<li>`` position to click for page turn ``turn``.

    NOTE(review): presumably the pager gains entries during the first few
    turns, which shifts the control being clicked - confirm on the live page.
    """
    if turn > 4:
        return 11
    if turn > 3:
        return 10
    return 9


browser = webdriver.Chrome()
try:
    browser.get("https://koubei.baidu.com/rank?tid=1702")
    browser.maximize_window()
    time.sleep(_RENDER_DELAY)
    _scrape_current_page(browser, alist)

    for turn in range(1, _PAGE_TURNS + 1):
        time.sleep(_RENDER_DELAY)
        pager_xpath = "{}/ul/li[{}]".format(_RESULTS_XPATH, _pager_index(turn))
        browser.find_element(By.XPATH, pager_xpath).click()
        _scrape_current_page(browser, alist)
finally:
    # Always shut the browser down, even when the run is interrupted.
    browser.quit()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值