# A comprehensive crawler example combining Selenium, authenticated login,
# JSON-API scraping, URL navigation, and saving results to Excel.
import time
import json
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException
import requests
import urllib.parse
import xlwt
class Gjh():
    """Scrape exhibitor contact data for one industry category and save it to Excel.

    Workflow: log in with Selenium, open the configured category on the
    listing page, collect every company's detail URL through the backend
    JSON API, visit each company page to read its contact fields, and
    finally write all rows to a .xls file.
    """

    # (header title, key in the scraped row dict) for each Excel column.
    # The site's field labels keep a trailing ':' on the first ten entries,
    # so the lookup keys differ from the header titles there.
    _COLUMNS = (
        ('企业名称', '企业名称:'),
        ('企业类型', '企业类型:'),
        ('成立年份', '成立年份:'),
        ('注册资本', '注册资本:'),
        ('企业规模', '企业规模:'),
        ('主要目标客户', '主要目标客户:'),
        ('主营展品', '主营展品:'),
        ('地址', '地址:'),
        ('所在地区', '所在地区:'),
        ('网址', '网址:'),
        ('业务联系人', '业务联系人'),
        ('邮箱', '邮箱'),
        ('电话', '电话'),
        ('手机', '手机'),
        ('传真', '传真'),
        ('邮编', '邮编'),
    )

    def __init__(self, index, name):
        """Prepare a Chrome driver tuned for scraping.

        index -- position of the target category in the left-hand nav list.
        name  -- category name; used as the Excel sheet name.
        """
        self.categoryIndex = index
        self.categoryName = name
        options = webdriver.ChromeOptions()
        # Skip image loading to speed up page fetches.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # Important: hide the automation switch so sites are less likely to
        # detect that Selenium is driving the browser.
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Optional local proxy:
        # options.add_argument("--proxy--server=127.0.0.1:8080")
        # Spoof a regular desktop Chrome user agent.
        ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
        options.add_argument('user-agent=' + ua)
        self.driver = webdriver.Chrome(options=options)

    def openPage(self):
        """Log in, navigate to the category listing, crawl every company, save Excel."""
        # Placeholder URL -- replace with the real site address.
        url = 'https://www.test.com/'
        self.driver.maximize_window()
        self.driver.get(url)
        time.sleep(2)
        # Fill in the login form.
        # NOTE(security): credentials are hard-coded; move them to config/env.
        user = self.driver.find_element_by_css_selector("div.userName input")
        user.send_keys(Keys.CONTROL, 'a')
        user.send_keys(Keys.DELETE)
        user.send_keys('13270538237')
        passwd = self.driver.find_element_by_css_selector("div.pass-fa input")
        passwd.send_keys(Keys.CONTROL, 'a')
        passwd.send_keys(Keys.DELETE)
        passwd.send_keys('booab20')
        # Accept the user agreement, then submit the login form.
        self.driver.find_element_by_css_selector('div.agreement input').click()
        time.sleep(2)
        self.driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/button').click()
        time.sleep(2)
        self.driver.get("https://www.test.com/hello")
        time.sleep(2)
        categories = self.driver.find_elements_by_css_selector('div.left-nav a')
        # Report how many industry categories are available.
        print(len(categories))
        # Open the configured category, then its first tab.
        categories[self.categoryIndex].click()
        time.sleep(2)
        self.driver.find_elements_by_css_selector('ul.tabs-bar li')[0].click()
        time.sleep(2)
        company_list = self.getCompanyUrls()
        result = []
        try:
            self.getAllData(result, company_list)
        finally:
            # Always persist whatever was collected, even if the crawl died midway.
            self.saveExcel(result)

    def getCompanyUrls(self):
        """Return the detail-page URL of every company in the current category."""
        # The category id travels in the listing page's query string.
        current_url = self.driver.current_url
        print(current_url)
        parsed = urllib.parse.urlparse(current_url)
        qq = urllib.parse.parse_qs(parsed.query)
        print(qq)
        # parse_qs yields a list; the API payload expects exactly that list form.
        categoryId = qq['categoryId']
        print(categoryId)
        size = 15
        # Fetch page 0 first to learn the total page count, then the rest.
        company_list = []
        totalPage = self.getOnePageCompanyUrls(categoryId, 0, size, company_list)
        # Pages are 0-indexed and page 0 was fetched above, so the remaining
        # pages are 1..totalPage-1.  (The original looped to totalPage
        # inclusive -- an off-by-one that requested one page past the end.)
        for page in range(1, totalPage):
            time.sleep(1)
            self.getOnePageCompanyUrls(categoryId, page, size, company_list)
        llentgh = len(company_list)
        print("llentgh===" + str(llentgh))
        return company_list

    def getOnePageCompanyUrls(self, categoryIds, page, size, company_list):
        """Fetch one page of companies from the JSON API.

        Appends each company's Chinese-site URL to company_list and returns
        the total page count reported by the API response.
        """
        print("page num === " + str(page))
        categoryId = categoryIds[0]
        link = 'https://gateway.cantonfair.org.cn/Api/ESAPI/company/classify-v2?page=' + str(page) + '&size=' + str(size) + '&type=undefined'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   'Content-Type': 'application/json;charset=UTF-8',
                   'Referer': 'https://ex.cantonfair.org.cn/mainsite/zh?categoryId=' + categoryId + '&_ga=2.102673793.1446854882.1593008478-19231745.1593008478',
                   'Origin': 'https://ex.cantonfair.org.cn'}
        # Build the payload as a dict and serialize it with json.dumps instead
        # of hand-concatenating a JSON string (the original was fragile to
        # quoting/escaping mistakes).
        payload = {
            "page": page,
            "size": size,
            "searchKeys": ["companyNameCN", "companyNameEN"],
            "isSearch": False,
            "selectAndMap": {"categoryId": categoryIds, "boothAreaSearch": [], "boothNumber": []},
            "orderModel": {"order": "asc", "properties": []},
            "selectOrMap": {"isPovertyAlleviation": [], "isFirstJoin": [], "isContinuousJoin": [],
                            "isBrand": [], "exhibitorType": [], "productTrait": [], "isCfWinner": [],
                            "tradeTypes": [], "isGreenAward": [], "isInvitationAward": []},
            "searchValue": "",
        }
        ccdata = json.dumps(payload)
        print(ccdata)
        r = requests.post(link, headers=headers, data=ccdata)
        print(r.text)
        resultJson = json.loads(r.text)
        returnObj = resultJson.get('returnObj')
        print(returnObj)
        for company in returnObj.get('list'):
            company_list.append(company['companyHrefCN'])
        totalPage = returnObj.get('page').get('totalPage')
        print("totalPage===" + str(totalPage))
        print("company_list size===" + str(len(company_list)))
        return totalPage

    def getAllData(self, result, comany_list):
        """Visit every company detail page and collect its contact fields.

        Each page is retried up to twice with growing back-off; the final
        failure propagates to the caller (openPage still saves partial data).
        """
        # Seconds to wait before each retry attempt.
        retry_delays = (10, 50)
        for i, exhibitorLink in enumerate(comany_list):
            print("exhibitorLink==**********==" + str(i))
            print(exhibitorLink)
            # Jump straight to the exhibitor's company-detail page.
            self.driver.get(exhibitorLink + '/company')
            attempt = 0
            while True:
                try:
                    self.getAttr(result)
                    break
                # Catch Exception, not a bare except: the original's bare
                # clauses also swallowed KeyboardInterrupt/SystemExit.
                except Exception:
                    if attempt >= len(retry_delays):
                        raise
                    if attempt == 0:
                        print('try again, num is====' + str(i))
                    else:
                        print('try reagain, num is====' + str(i))
                    time.sleep(retry_delays[attempt])
                    attempt += 1

    def getAttr(self, result):
        """Scrape the label/value contact fields on the current company page.

        Appends one dict (label text -> value text) to result.
        """
        time.sleep(1)
        # Scroll to the bottom so lazily-rendered content appears.
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(1)
        # Click "view contact info" to reveal the hidden fields.
        self.driver.find_element_by_css_selector('div.ex-60__contact-forbidden a.ex-60__contact-view').click()
        time.sleep(1)
        items = self.driver.find_elements_by_css_selector('div.ex-60__inner div.cell')
        one = {}
        for item in items:
            for field in item.find_elements_by_css_selector('div.cell-item'):
                label = field.find_element_by_css_selector('div.cell-label').text
                value = field.find_element_by_css_selector('div.cell-value').text
                one[label] = value
        result.append(one)

    def saveExcel(self, result):
        """Write the collected rows to d:\\data_<categoryIndex>.xls."""
        # Workbook with utf-8 encoding so Chinese text survives the round trip.
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet(self.categoryName)
        # Row 0: column headers.
        for col, (title, _key) in enumerate(self._COLUMNS):
            worksheet.write(0, col, label=title)
        # One row per scraped company; missing fields become empty cells.
        for rowIndex, row in enumerate(result, start=1):
            for col, (_title, key) in enumerate(self._COLUMNS):
                worksheet.write(rowIndex, col, label=row.get(key))
        filePath = 'd:\\data_' + str(self.categoryIndex) + '.xls'
        workbook.save(filePath)
if __name__ == '__main__':
    # Crawl category index 11 ('纺织服装' -- textiles & apparel).
    crawler = Gjh(11, '纺织服装')
    crawler.openPage()
# End of file.