[Web Scraping] Automatically search by keyword and scrape the results

This post covers two automated scraping strategies: one for pages where the total page count can be obtained, which first scrapes the page count and then scrapes each page; and one for pages where the count is not directly available, which simply keeps advancing to the next page. Selenium is used to drive the browser, and the scraped fields (product name, company name, phone number, etc.) are collected and exported to a CSV file.

Automatically search by keyword and scrape the information from the result pages.
The result pages come in two kinds: those where the total page count can be read directly and those where it cannot, and the two cases call for different approaches.
In both cases the search URL is built by percent-encoding the keyword, as shown in the short sketch below; the two scraping strategies then follow.
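Both scripts percent-encode the keyword with urllib.parse.quote before appending it to the site's search URL; the first site expects a GB2312-encoded query string, the second a UTF-8 one. A minimal standalone sketch (the keyword here is only an illustration, not from the original post):

import urllib.parse

keyword = "轴承"  # illustrative keyword
gb_wd = urllib.parse.quote(keyword.encode('gb2312'))   # '%D6%E1%B3%D0'
utf_wd = urllib.parse.quote(keyword.encode('utf-8'))   # '%E8%BD%B4%E6%89%BF'
print(gb_wd, utf_wd)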

Case 1: scrape the total page count first, then scrape each page

# coding=utf-8
import pandas as pd
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import re
import random

# Configure a headless Chrome instance with image loading disabled to speed things up
option = webdriver.ChromeOptions()
option.add_argument("--headless")
# option.binary_location = r"...\chrome.exe"
option.add_argument('--blink-settings=imagesEnabled=false')
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe"
                          , options=option)
head_url = "partial base URL + key="  # placeholder for the site's search URL prefix
keywords_all = []  # fill with the full keyword list (elided in the original post)
keywords = keywords_all[410:444]

keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
phone_list = []


def PageNumber(keyword):
    """Return the total number of results for a keyword, or -1 if it cannot be read."""
    # this site expects a GB2312-encoded keyword in the query string
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=0'
    driver.get(turn_url)
    # print(driver.page_source)
    time.sleep(random.randint(1, 3))
    try:
        source = driver.find_element(By.XPATH,
                                     "//div[@class='gys']/dl/dt/span").text
        # the counter text reads "...有N家...", extract N
        reg = re.findall(r".*有(.*)家", source)
        page_number = int(reg[0])
        print("Total results:", page_number)
        return page_number
    except:
        return -1

def GetResult(keyword, page):
    """Scrape company name, company URL and phone number from one result page."""
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=' + str(page)
    print(turn_url)
    try:
        driver.get(turn_url)
        time.sleep(random.randint(2, 4))
        items = driver.find_elements(By.XPATH,
                                     "//div[@class='gys']/dl/dd/form")
        for l in items:
            company = l.find_element(By.XPATH, "./table/tbody/tr/td/a").text
            print(company)
            company_name_list.append(company)
            company_url = l.find_element(By.XPATH, "./table/tbody/tr/td/a[1]").get_attribute('href')
            print(company_url)
            company_url_list.append(company_url)
            phone = l.find_element(By.XPATH, "./table/tbody/tr[2]/td[2]").text
            print(phone)
            phone_list.append(phone)
            print(keyword)
            keyword_list.append(keyword)
    except:
        print('Failed to load the page')

for i in keywords:
    total = PageNumber(keyword=i)
    if total == -1:
        # the result counter could not be read: treat as no data for this keyword
        print(i, 'no data')
        continue
    page_number = int(total / 10)  # roughly ten results per page
    if page_number == 0:
        try:
            GetResult(keyword=i, page=0)
        except:
            continue
    else:
        for p in range(0, page_number):
            try:
                GetResult(keyword=i, page=p)
            except:
                continue

# assemble aligned rows and export them to CSV ("###.csv" is a placeholder path)
data_list = []
for a, b, c, d in zip(keyword_list, company_name_list, company_url_list, phone_list):
    x = {}
    x['keyword'] = a
    x['company_name'] = b
    x['company_url'] = c
    x['phone'] = d
    data_list.append(x)
# print(data_list)
with open(r"###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'company_name', 'company_url', 'phone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("Write complete!")

Case 2: the page count cannot be scraped, so the pages are fetched one after another

# coding=utf-8
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import random
import pandas as pd

# Same headless Chrome setup as in case one
option = webdriver.ChromeOptions()
option.add_argument("--headless")
# option.binary_location = r"...\chrome.exe"
option.add_argument('--blink-settings=imagesEnabled=false')
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe", options=option)
head_url = "partial base URL + keyword="  # placeholder for the site's search URL prefix
keywords_all = []  # fill with the full keyword list (elided in the original post)
keywords = keywords_all[400:444]

keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
mobilephone_list = []
telephone_list = []

def NextPage(keyword, page):
    """Load one result page and return how many result items it contains."""
    # this site expects a UTF-8 encoded keyword in the query string
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    print(turn_url)
    driver.get(turn_url)
    time.sleep(random.randint(1, 3))
    items = driver.find_elements(By.XPATH,
                                 "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
    return len(items)

def GetResult(keyword, page):
    """Scrape product name, company, company URL and phone numbers from one result page."""
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    driver.get(turn_url)
    time.sleep(random.randint(3, 5))
    try:
        items = driver.find_elements(By.XPATH,
                                     "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
        for l in items:
            product_name = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='intro-box']/div[@class='tt']/a").text
            print(product_name)
            product_name_list.append(product_name)
            # fall back to empty strings so all lists stay the same length for the zip below
            try:
                telephone = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[2]").text
            except:
                telephone = ''
            print(telephone)
            telephone_list.append(telephone)
            try:
                mobilephone = l.find_element(By.XPATH,
                                             "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[3]").text
            except:
                mobilephone = ''
            print(mobilephone)
            mobilephone_list.append(mobilephone)
            company = l.find_element(By.XPATH,
                                     "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em").text
            print(company)
            company_name_list.append(company)
            links = l.find_elements(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em/a")
            company_url = links[0].get_attribute('href') if links else ''
            print(company_url)
            company_url_list.append(company_url)
            print(keyword)
            keyword_list.append(keyword)
    except:
        print("Scrape failed")

for i in keywords:
    this_page = 0
    # a full result page holds 20 items; keep turning pages while they are full
    while NextPage(keyword=i, page=this_page) > 19:
        GetResult(keyword=i, page=this_page)
        this_page = this_page + 1
    # scrape the last, not-completely-full page as well
    GetResult(keyword=i, page=this_page)

# assemble aligned rows and export them to CSV ("###.csv" is a placeholder path)
data_list = []
for a, b, c, d, e, f in zip(keyword_list, product_name_list, company_name_list, company_url_list, mobilephone_list, telephone_list):
    x = {}
    x['keyword'] = a
    x['product_name'] = b
    x['company_name'] = c
    x['company_url'] = d
    x['mobilephone'] = e
    x['telephone'] = f
    data_list.append(x)
# print(data_list)

with open("###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'product_name', 'company_name', 'company_url', 'mobilephone', 'telephone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("Write complete!")