背景
举例目标网站:抖音带货视频排行榜_热门电商种草短视频数据 - 蝉妈妈数据 (chanmama.com)
比如,我想爬取这个页面在选定筛选条件下的数据:列表中的一些关键数据,以及每一行点击【详情】后跳转到的新页面里的关键数据。
实现方案
一、
Selenium-Python中文文档 自动化测试方式
存在问题:
1、每个界面元素都要逐一定位,并且要先滚动到可视区域内才能操作。这样获取一条数据行,要七八秒;
2、账号不能在其他地方登录,不然会提示:重新登录。
缺点:耗时非常久。
二、使用Python +urllib
其实就是使用http多线程调用,调用时候设置请求header
存在问题:
每个页面的cookie不同。 行不通。
优点:
如果cookie相同: 非常快能得到想要的数据
探讨:
不幸的是,我在爬抖音带货视频排行榜_热门电商种草短视频数据 - 蝉妈妈数据 (chanmama.com)
网站数据的时候,方案一、方案二的问题,都遇到了。
所以有没有大神指点下,更完美、高效的方案?
代码
方案一代码:
说明: 浏览器:edge , 用户名、密码自己填写
# coding:utf-8
import time
import urllib3
print("忽略SSL证书")
# https请求 忽略 SSl证书
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import openpyxl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Absolute XPaths into the chanmama.com pages driven by this script.
# NOTE(review): absolute paths depend on the exact DOM nesting, so they will
# presumably break whenever the site layout changes - verify before each run.

# Login form: user-name input, password input and submit button.
htmlLoginXPathName = r'/html/body/div[1]/div[1]/div/div[1]/div[2]/div/div[2]/div[2]/div/form/div[1]/div/div/input'
htmlLoginXPathPwd = r"/html/body/div[1]/div[1]/div/div[1]/div[2]/div/div[2]/div[2]/div/form/div[2]/div/div/input"
htmlLoginBtnXPath = r"/html/body/div[1]/div[1]/div/div[1]/div[2]/div/div[2]/div[2]/div/form/div[4]/div/button"

# Ranking-page filter controls. Each control has the clickable element and a
# sibling "...Span" path; the Span variants are not referenced by the visible code.
btnSelectAllXPathSpan = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[1]/div/div/div/div/div[1]/div/span"
btnSelectAllXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[1]/div/div/div/div/div[1]/div"
# Presumably the "monthly ranking" (yue bang) option - confirm against the page.
# (The original assigned this constant twice with the same value; the duplicate is removed.)
btnYueBangXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[2]/div/div[2]/div/label[3]/input"
btnYueBangXPathSpan = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[2]/div/div[2]/div/label[3]/span"
# Presumably the "sort by likes" (digg) option - confirm against the page.
btnDiggXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[3]/div[2]/label[3]/input"
btnDiggXPathSpan = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[3]/div[2]/label[3]/span"

# Detail link of table row 5 (a fixed sample row) and the "load more" control
# below the ranking table.
btnVidioDetailXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[1]/div/div[2]/table/tbody/tr[5]/td[9]/div/a[1]"
btnVidioPageMoreXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[2]/div/span[2]"
def doGetChanMaMaForChooseCondition(edgeDriver=None, strType = None, rank = 1):
    """Open the ranking page, re-apply the filters and load enough rows for ``rank``.

    The function navigates ``edgeDriver`` to the ranking URL, clicks the
    "select all", ``btnYueBangXPath`` and ``btnDiggXPath`` controls via
    JavaScript, then clicks the "load more" control ``rank // 50`` times.

    Args:
        edgeDriver: Selenium WebDriver instance to drive.
        strType: Caller-supplied description of the current step. It is
            accepted for interface compatibility but not used here.
        rank: 1-based row number the caller wants to read afterwards.

    Returns:
        None. This is best-effort: any exception is reported with ``print``
        and swallowed, so callers must not assume the page is ready.
    """
    try:
        targetUrl = r"https://www.chanmama.com/promotionAwemeRank?big_category=&first_category=&second_category="
        edgeDriver.get(targetUrl)
        time.sleep(2)

        btnSelectAll = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnSelectAllXPath)))
        edgeDriver.execute_script("arguments[0].scrollIntoView(true);", btnSelectAll)
        # Clicks go through JavaScript so they work even when the element is
        # covered or outside the viewport.
        edgeDriver.execute_script("arguments[0].click()", btnSelectAll)
        time.sleep(1)

        btnYueBang = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnYueBangXPath)))
        edgeDriver.execute_script("arguments[0].click()", btnYueBang)
        time.sleep(1)

        btnDigg = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnDiggXPath)))
        edgeDriver.execute_script("arguments[0].click()", btnDigg)
        time.sleep(1)

        # One "load more" click per 50 ranks. Floor division replaces the
        # original `rank / 50 > 0` guard, which was always true for rank >= 1.
        for _ in range(int(rank // 50)):
            try:
                btnPageMore = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnVidioPageMoreXPath)))
                edgeDriver.execute_script("arguments[0].scrollIntoView(true);", btnPageMore)
                time.sleep(1)
                # A native click is used here, so the control must be in view.
                btnPageMore.click()
            except Exception:
                # Best-effort paging: a missing control must not abort the run.
                print("btnPageMore 异常")
            # Give the newly requested rows time to render before the next click.
            time.sleep(2)
    except Exception as ex:
        # Report the line one frame below this function when there is one;
        # fall back to this frame when the error was raised directly here
        # (tb_next is None in that case).
        tb = ex.__traceback__
        if tb is not None and tb.tb_next is not None:
            tb = tb.tb_next
        lineNo = tb.tb_lineno if tb is not None else "?"
        print("错误行号:%s, 异常:%s" % (str(lineNo), str(ex)[:600]))
def doGetChanMaMaDataList(resultDataMapList=None, edgeDriver=None, totalCount = 2):
    """Scrape ranks ``1..totalCount`` and append one dict per rank to the list.

    Args:
        resultDataMapList: List that receives one dict per rank. It is
            mutated in place and sorted by ``'rank'`` before returning.
        edgeDriver: Selenium WebDriver passed through to ``doAnalysisChanMaMa``.
        totalCount: Number of ranking rows to scrape, starting at rank 1.
            The original ignored this argument and always scraped a
            hard-coded range of 98..101.

    Returns:
        None. Results are delivered through ``resultDataMapList``.
    """
    for rank in range(1, totalCount + 1):
        resultLineDataMap = {'rank': rank}
        # Register the row before scraping so that a row which fails halfway
        # is still exported with whatever fields were collected.
        resultDataMapList.append(resultLineDataMap)
        try:
            doAnalysisChanMaMa(edgeDriver, rank, resultLineDataMap)
        except Exception as ex:
            currentUrl = edgeDriver.current_url
            print("currentUrl: %s"%currentUrl)
            if currentUrl == "https://www.chanmama.com/404":
                # The linked page does not exist; move on to the next rank.
                continue
            print("rank: %s, 异常:%s, data: %s" % (str(rank), str(ex), str(resultLineDataMap)))
    # Sort in place: the original called sorted() and discarded the result,
    # so the caller's list was never actually sorted.
    resultDataMapList.sort(key=lambda x: x['rank'])
def doAnalysisChanMaMa(edgeDriver, i, resultLineDataMap):
    """Fill ``resultLineDataMap`` with the data scraped for ranking row ``i``.

    Three pages are visited in order: the ranking list (row cells), the
    author page linked from the row, and the video detail page linked from
    the row. The ranking list is reopened through
    ``doGetChanMaMaForChooseCondition`` before each row access and once more
    at the end. Fields are stored one by one, so if a wait times out the
    fields collected so far stay in the dict and the exception propagates.

    Args:
        edgeDriver: Selenium WebDriver instance to drive.
        i: 1-based index of the table row (``tr[i]``) to read.
        resultLineDataMap: Dict that receives the scraped fields.
    """
    def waitFor(xpath, condition=EC.presence_of_element_located):
        # Wait up to 5 seconds for the element at `xpath` to satisfy `condition`.
        return WebDriverWait(edgeDriver, 5).until(condition((By.XPATH, xpath)))

    def textOf(xpath):
        return waitFor(xpath).text

    def hrefOf(xpath):
        return waitFor(xpath, EC.element_to_be_clickable).get_attribute("href")

    def reopenRankList(stage):
        doGetChanMaMaForChooseCondition(edgeDriver=edgeDriver, strType="rank : " + str(i) + " , " + stage, rank=i)

    # All row cells share this prefix; only the trailing cell path differs.
    rowXpath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[1]/div/div[2]/table/tbody/tr[" + str(i) + r"]"
    titleXpath = rowXpath + r"/td[2]/div/div[2]/div[1]/a"    # video title
    authorXpath = rowXpath + r"/td[3]/div/div[2]/div[1]/a"   # author
    detailXpath = rowXpath + r"/td[9]/div/a[1]"              # details link
    rowCells = (
        ('vidioProductTitle', rowXpath + r"/td[2]/div/div[2]/div[2]/div[2]/div/a"),  # linked product
        ('vidioAuthor', authorXpath),                                                # author
        ('vidioDigg', rowXpath + r"/td[4]/div"),                                     # like count
        ('vidioShare', rowXpath + r"/td[5]/div"),                                    # share count
        ('vidioComment', rowXpath + r"/td[6]/div"),                                  # comment count
    )

    # --- ranking list row ---
    reopenRankList("解析视频,开始")
    titleElement = waitFor(titleXpath)
    edgeDriver.execute_script("arguments[0].scrollIntoView(true);", titleElement)
    resultLineDataMap['vidioTitle'] = titleElement.text
    for fieldName, cellXpath in rowCells:
        resultLineDataMap[fieldName] = textOf(cellXpath)

    # --- author page ---
    # The row's author link has the form
    # https://www.chanmama.com/authorDetail/2506485403493080
    edgeDriver.get(hrefOf(authorXpath))
    authorCells = (
        ('authorMcnName', r"/html/body/div[1]/div[1]/div[3]/div[1]/div/div[1]/div[2]/div[1]/div[4]/div/div[2]"),
        ('authorFollowerCount', r"/html/body/div[1]/div[1]/div[3]/div[1]/div/div[1]/div[3]/div[1]/div[1]/div[2]/div[1]"),
    )
    for fieldName, cellXpath in authorCells:
        resultLineDataMap[fieldName] = textOf(cellXpath)

    # --- video detail page ---
    # The browser left the list for the author page, so reopen the list
    # before reading the row's detail link.
    reopenRankList("解析作者,开始")
    edgeDriver.get(hrefOf(detailXpath))

    # Fields whose on-page text carries a label prefix that is stripped.
    labelledCells = (
        ('vidioTime', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[2]/div/div[1]/div[1]/div/div/div[2]/div[2]", "视频时长 "),
        ('vidioPublishTime', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[2]/div/div[1]/div[1]/div/div/div[2]/div[1]", "发布于 "),
        ('vidioFollowerMan', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[4]/div/div/div[2]/div/div/div/div[2]/div/span[1]", "男 "),
    )
    for fieldName, cellXpath, label in labelledCells:
        resultLineDataMap[fieldName] = textOf(cellXpath).replace(label, "")

    # The female share is derived as the complement of the male share.
    malePercent = int(resultLineDataMap['vidioFollowerMan'].replace("%", ""))
    resultLineDataMap['vidioFollowerWomen'] = str(100 - malePercent) + "%"

    productCells = (
        ('vidioProductCatagory1', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[5]/div/div[3]/div[1]/div[2]/div/div[1]"),
        ('vidioProductCatagory2', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[5]/div/div[3]/div[1]/div[2]/div/div[2]"),
        ('vidioProductName', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[2]/div/div[2]/div[1]/div/table/tbody/tr/td[1]/div/div[2]/a"),
        ('vidioProductPrice', r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[2]/div/div[2]/div[1]/div/table/tbody/tr/td[2]/div/div[2]"),
    )
    for fieldName, cellXpath in productCells:
        resultLineDataMap[fieldName] = textOf(cellXpath)

    reopenRankList("解析视频详情,开始")
    print("排名:%d , 数据:%s" % (i, str(resultLineDataMap)))
def _chanMamaDurationToSeconds(durationText):
    """Convert a duration such as ``"1分5秒"`` or ``"45秒"`` to whole seconds.

    A missing minute or second part counts as zero, and text with neither
    part yields 0 instead of raising.
    """
    import re

    durationText = durationText or ""
    minutesMatch = re.search(r"(\d+)\s*分", durationText)
    secondsMatch = re.search(r"(\d+)\s*秒", durationText)
    minutes = int(minutesMatch.group(1)) if minutesMatch else 0
    seconds = int(secondsMatch.group(1)) if secondsMatch else 0
    return 60 * minutes + seconds


def doWriteDataToExcelForChanMama(filePath=None, dataList=None):
    """Write the scraped rows into sheet ``Sheet1`` of an existing workbook.

    The header is written to row 2 and one data row per item from row 3 on,
    using columns A..R. The workbook at ``filePath`` is loaded, modified and
    saved back to the same path.

    Args:
        filePath: Path of an existing ``.xlsx`` file containing ``Sheet1``.
        dataList: Iterable of dicts as produced by ``doAnalysisChanMaMa``.
            Only ``'rank'`` is required; missing fields become empty cells.
            ``None`` makes the function return without touching the file.
    """
    if dataList is None:
        return
    print("执行文件写入, data: %s"%(dataList.__str__()))

    columnLetters = "ABCDEFGHIJKLMNOPQR"
    headerRow = 2
    headers = (
        "排名",
        "视频",
        "关联商品",
        "达人",
        "点赞数",
        "博主粉丝数",
        "博主是否签约MSN机构",  # NOTE(review): probably meant "MCN"; left unchanged
        "转发数",
        "评论数",
        "发布时间",
        "视频时长(秒)",
        "视频标题字符",
        "女性观众占比(%)",
        "男性观众占比(%)",
        "视频主题",
        "关联产品类别",
        "关联产品明细",
        "关联商品价格(元)",
    )

    excelFile = openpyxl.load_workbook(filePath)
    try:
        excelSheet = excelFile['Sheet1']
        for letter, title in zip(columnLetters, headers):
            excelSheet[letter + str(headerRow)] = title

        for rowNum, data in enumerate(dataList, start=headerRow + 1):
            values = (
                data["rank"],
                data.get('vidioTitle', ""),
                data.get('vidioProductTitle', ""),
                data.get('vidioAuthor', ""),
                data.get('vidioDigg', ""),
                data.get('authorFollowerCount', ""),
                data.get('authorMcnName', ""),
                data.get('vidioShare', ""),
                data.get('vidioComment', ""),
                data.get('vidioPublishTime', ""),
                _chanMamaDurationToSeconds(data.get('vidioTime', "0分0秒")),
                "",  # title character count is not computed
                data.get('vidioFollowerWomen', ""),
                data.get('vidioFollowerMan', ""),
                data.get('vidioProductName', ""),
                data.get('vidioProductCatagory1', "") + " - " + data.get('vidioProductCatagory2', ""),
                # NOTE(review): 'vidioProductPrice1' is never set by the visible
                # scraper, so this column is always empty - confirm intended key.
                data.get('vidioProductPrice1', ""),
                data.get('vidioProductPrice', ""),
            )
            for letter, value in zip(columnLetters, values):
                excelSheet[letter + str(rowNum)] = value

        excelFile.save(filePath)
    finally:
        # Close the workbook even when writing or saving fails.
        excelFile.close()
    print("文件写入结束")
def chanMaMaUseEdge(edgeDriverPath=None, totalCount = 2):
    """Log in to chanmama.com with Edge, scrape the ranking and export to Excel.

    Args:
        edgeDriverPath: Path to ``msedgedriver``. ``None`` makes the function
            return immediately.
            NOTE(review): the value is only checked, never passed to
            ``webdriver.Edge()`` - the driver is presumably found via PATH.
        totalCount: Forwarded to ``doGetChanMaMaDataList``.

    The browser is always closed, including when login or scraping raises.
    In that case the exception propagates and nothing is written to Excel.
    """
    if edgeDriverPath is None:
        return
    startTime = time.time()
    targetUrl = r"https://www.chanmama.com/promotionAwemeRank?big_category=&first_category=&second_category="
    # Fill in real credentials before running.
    userName = "XXXXXXX"
    password = "XXXXXXX"
    resultDataMapList = []

    edgeDriver = webdriver.Edge()
    try:
        edgeDriver.get(targetUrl)
        edgeDriver.maximize_window()
        edgeDriver.implicitly_wait(5)
        # Fixed pause before touching the login form fields.
        time.sleep(5)

        inputName = WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, htmlLoginXPathName)))
        inputName.send_keys(userName)
        inputPwd = WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, htmlLoginXPathPwd)))
        inputPwd.send_keys(password)
        loginBtn = WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, htmlLoginBtnXPath)))
        loginBtn.click()
        # Fixed pause after submitting the login form.
        time.sleep(5)

        doGetChanMaMaDataList(resultDataMapList, edgeDriver, totalCount)
    finally:
        # Always release the browser; the original leaked it on any error.
        edgeDriver.quit()

    # Write the collected data to Excel.
    writeDataToExcelForChanMamaPath = r"D:\02 Work\4px\2022\20_业余\chanmama.xlsx"
    doWriteDataToExcelForChanMama(writeDataToExcelForChanMamaPath, resultDataMapList)
    print("总耗时:%d"%(time.time() - startTime))
if __name__ == "__main__":
    # Script entry point: run the Edge-based scraper with the local driver
    # path and the requested row count.
    chanMaMaUseEdge(
        edgeDriverPath=r"D:\Program_Files\python-3.7.0\msedgedriver.exe",
        totalCount=67,
    )
方案二代码:
略