Python爬取网站方案实现探讨（附完整代码）

最新推荐文章于 2024-02-16 10:58:26 发布

蓝鹰_李辉

最新推荐文章于 2024-02-16 10:58:26 发布

阅读量452

点赞数

分类专栏： Python 文章标签： python 爬虫 selenium 网络爬虫

本文链接：https://blog.csdn.net/ghost_ly555/article/details/124500406

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

背景

比如，我想爬这个页面在这些选择条件下的数据：这个列表的一些关键数据，以及每一行里面点击【详情】后跳转到的新页面里面的关键数据。

实现方案

一、Selenium-Python中文文档自动化测试方式

存在问题：

1、每个界面元素，要逐一跳转，并且要先滚动到可视区域内。这样获取一条数据行，要七八秒。

2、账号不能在其他地方登录，不然会提示：重新登录

缺点：耗时非常久。

二、使用Python +urllib

其实就是使用http多线程调用，调用时候设置请求header

存在问题：

每个页面的cookie不同。行不通。

优点：

如果cookie相同：非常快能得到想要的数据

探讨：

爬取网站数据的时候，方案一、方案二的问题，都遇到了。

所以有没有大神指点下，更完美、高效的方案？

代码

方案一代码：

说明：浏览器：edge , 用户名、密码自己填写

# coding:utf-8

import time
import urllib3

print("忽略SSL证书")
# Silence urllib3's InsecureRequestWarning (raised for unverified HTTPS
# requests). This call only suppresses the warning output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import openpyxl

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Absolute XPaths into the chanmama.com pages driven by this script.
# NOTE(review): absolute XPaths are tied to the exact page layout; presumably
# they must be re-captured whenever the site markup changes.

# Login form: user-name input, password input and submit button.
htmlLoginXPathName = r'/html/body/div[1]/div[1]/div/div[1]/div[2]/div/div[2]/div[2]/div/form/div[1]/div/div/input'
htmlLoginXPathPwd = r"/html/body/div[1]/div[1]/div/div[1]/div[2]/div/div[2]/div[2]/div/form/div[2]/div/div/input"
htmlLoginBtnXPath = r"/html/body/div[1]/div[1]/div/div[1]/div[2]/div/div[2]/div[2]/div/form/div[4]/div/button"

# Ranking-page filter controls (each control plus its inner <span>).
btnSelectAllXPathSpan = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[1]/div/div/div/div/div[1]/div/span"
btnSelectAllXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[1]/div/div/div/div/div[1]/div"
# The original assigned btnYueBangXPath twice with the same value; one assignment is enough.
btnYueBangXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[2]/div/div[2]/div/label[3]/input"
btnYueBangXPathSpan = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[2]/div/div[2]/div/label[3]/span"
btnDiggXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[3]/div[2]/label[3]/input"
btnDiggXPathSpan = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div[3]/div[2]/label[3]/span"

# Ranking table: detail link of the fifth row (tr[5]) and the control below the table.
btnVidioDetailXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[1]/div/div[2]/table/tbody/tr[5]/td[9]/div/a[1]"
btnVidioPageMoreXPath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[2]/div/span[2]"


def doGetChanMaMaForChooseCondition(edgeDriver=None, strType=None, rank=1):
    """Open the ranking page, click the three filter controls and page forward.

    The function loads the ranking URL, clicks (via JavaScript) the elements
    located by ``btnSelectAllXPath``, ``btnYueBangXPath`` and ``btnDiggXPath``,
    then clicks the "more" control ``int(rank / 50)`` times.

    Args:
        edgeDriver: Selenium WebDriver instance used for all page actions.
        strType: Caller-supplied label; it is not used inside this function.
        rank: Row number the caller wants to reach; it only determines how
            many times the "more" control is clicked.

    Returns:
        None. Any ``Exception`` is reported with ``print`` and not re-raised,
        so callers see a failure only through their own later lookups.
    """
    try:
        targetUrl = r"https://www.chanmama.com/promotionAwemeRank?big_category=&first_category=&second_category="
        edgeDriver.get(targetUrl)
        time.sleep(2)
        btnSelectAll = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnSelectAllXPath)))
        edgeDriver.execute_script("arguments[0].scrollIntoView(true);", btnSelectAll)
        edgeDriver.execute_script("arguments[0].click()", btnSelectAll)
        time.sleep(1)
        btnYueBang = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnYueBangXPath)))
        edgeDriver.execute_script("arguments[0].click()", btnYueBang)
        time.sleep(1)
        btnDigg = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnDiggXPath)))
        edgeDriver.execute_script("arguments[0].click()", btnDigg)

        time.sleep(1)
        btnPageMoreXpath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[2]/div/span[2]"
        # range() is already empty for rank < 50, so no separate guard is needed.
        for _ in range(0, int(rank / 50)):
            try:
                btnPageMore = WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, btnPageMoreXpath)))
                edgeDriver.execute_script("arguments[0].scrollIntoView(true);", btnPageMore)
                time.sleep(1)
                # A native click is used here, unlike the JavaScript clicks above.
                btnPageMore.click()
            except Exception:
                # Best effort: a failed "more" click must not abort the remaining clicks.
                print("btnPageMore 异常")
            time.sleep(2)
    except Exception as ex:
        try:
            tb = ex.__traceback__
            # tb_next is None when the exception was raised directly in this
            # frame; fall back to this frame's own line so the real error is
            # reported instead of an AttributeError from the handler itself.
            lineTb = tb.tb_next if tb.tb_next is not None else tb
            print("错误行号：%s, 异常：%s" % (str(lineTb.tb_lineno), str(ex)[:600]))
        except Exception as ex2:
            print("异常中，异常， %s" % (ex2.__str__()))


def doGetChanMaMaDataList(resultDataMapList=None, edgeDriver=None, totalCount=2):
    """Scrape ranking rows into ``resultDataMapList`` and sort it by rank.

    For every rank in the loop a dict ``{'rank': i}`` is appended to the list
    *before* scraping, so a row that fails part-way still appears in the
    output with whatever fields were collected.

    Args:
        resultDataMapList: List that receives one dict per rank; it is
            mutated in place and must not be None.
        edgeDriver: Selenium WebDriver instance passed to the row scraper.
        totalCount: Accepted for interface compatibility.
            NOTE(review): this value is not used; the rank range below is
            hard-coded to 98..101. Confirm whether it should be
            ``range(1, totalCount + 1)``.

    Returns:
        None.
    """
    for i in range(98, 101 + 1):
        resultLineDataMap = {'rank': i}
        try:
            resultDataMapList.append(resultLineDataMap)
            doAnalysisChanMaMa(edgeDriver, i, resultLineDataMap)
        except Exception as ex:
            currentUrl = edgeDriver.current_url
            print("currentUrl: %s" % currentUrl)
            if currentUrl == "https://www.chanmama.com/404":
                # The target page does not exist; keep the partial row and move on.
                continue
            print("rank: %s, 异常：%s, data: %s" % (str(i), ex.__str__(), resultLineDataMap.__str__()))
    # sorted() returns a new list, which the original discarded; sort in place
    # so the caller's list really ends up ordered by rank.
    resultDataMapList.sort(key=lambda x: x['rank'])

def doAnalysisChanMaMa(edgeDriver, i, resultLineDataMap):
    """Collect the fields of ranking-table row ``i`` into ``resultLineDataMap``.

    The ranking page is (re)opened through doGetChanMaMaForChooseCondition
    before the row is read, again after the author page was visited, and once
    more after the video detail page was read. Every lookup waits up to five
    seconds; a lookup that times out raises out of this function.

    Args:
        edgeDriver: Selenium WebDriver instance used for all lookups.
        i: Index of the table row (``tr[i]``) to read.
        resultLineDataMap: Dict that is filled in place with the scraped values.
    """

    def waitPresent(xpath):
        # Wait up to 5 seconds for the element to be present in the DOM.
        return WebDriverWait(edgeDriver, 5).until(EC.presence_of_element_located((By.XPATH, xpath)))

    def waitHref(xpath):
        # Wait up to 5 seconds for the element to be clickable and return its href attribute.
        return WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).get_attribute("href")

    def reopenRanking(stage):
        doGetChanMaMaForChooseCondition(edgeDriver=edgeDriver, strType="rank : " + str(i) + stage, rank=i)

    # All cells of the row share the same absolute prefix up to tr[i].
    rowXpath = r"/html/body/div[1]/div[1]/div[2]/div/div/div/div[4]/div[1]/div/div[2]/table/tbody/tr[" + str(i) + r"]"
    titleXpath = rowXpath + r"/td[2]/div/div[2]/div[1]/a"  # video title
    productXpath = rowXpath + r"/td[2]/div/div[2]/div[2]/div[2]/div/a"  # related product
    authorXpath = rowXpath + r"/td[3]/div/div[2]/div[1]/a"  # author
    diggXpath = rowXpath + r"/td[4]/div"  # like count
    shareXpath = rowXpath + r"/td[5]/div"  # share count
    commentXpath = rowXpath + r"/td[6]/div"  # comment count
    detailXpath = rowXpath + r"/td[9]/div/a[1]"  # detail link

    reopenRanking(" , 解析视频，开始")

    # Scroll the title cell into view first, then read the row's cells.
    titleElement = waitPresent(titleXpath)
    edgeDriver.execute_script("arguments[0].scrollIntoView(true);", titleElement)
    resultLineDataMap['vidioTitle'] = titleElement.text
    resultLineDataMap['vidioProductTitle'] = waitPresent(productXpath).text
    resultLineDataMap['vidioAuthor'] = waitPresent(authorXpath).text
    resultLineDataMap['vidioDigg'] = waitPresent(diggXpath).text
    resultLineDataMap['vidioShare'] = waitPresent(shareXpath).text
    resultLineDataMap['vidioComment'] = waitPresent(commentXpath).text

    # Author page: follow the author link's href, e.g.
    # https://www.chanmama.com/authorDetail/2506485403493080
    edgeDriver.get(waitHref(authorXpath))
    authorMcnXpath = r"/html/body/div[1]/div[1]/div[3]/div[1]/div/div[1]/div[2]/div[1]/div[4]/div/div[2]"
    authorFollowerXpath = r"/html/body/div[1]/div[1]/div[3]/div[1]/div/div[1]/div[3]/div[1]/div[1]/div[2]/div[1]"
    resultLineDataMap['authorMcnName'] = waitPresent(authorMcnXpath).text
    resultLineDataMap['authorFollowerCount'] = waitPresent(authorFollowerXpath).text
    reopenRanking(" , 解析作者，开始")

    # Video detail page: follow the detail link's href.
    edgeDriver.get(waitHref(detailXpath))
    publishTimeXpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[2]/div/div[1]/div[1]/div/div/div[2]/div[1]"
    durationXpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[2]/div/div[1]/div[1]/div/div/div[2]/div[2]"
    priceXpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[2]/div/div[2]/div[1]/div/table/tbody/tr/td[2]/div/div[2]"
    productNameXpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[2]/div/div[2]/div[1]/div/table/tbody/tr/td[1]/div/div[2]/a"
    category1Xpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[5]/div/div[3]/div[1]/div[2]/div/div[1]"
    category2Xpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[5]/div/div[3]/div[1]/div[2]/div/div[2]"
    maleShareXpath = r"/html/body/div[1]/div[1]/div[1]/div/div[2]/div[3]/div[2]/div[4]/div/div/div[2]/div/div/div/div[2]/div/span[1]"

    resultLineDataMap['vidioTime'] = waitPresent(durationXpath).text.replace("视频时长 ", "")
    resultLineDataMap['vidioPublishTime'] = waitPresent(publishTimeXpath).text.replace("发布于 ", "")
    maleShareText = waitPresent(maleShareXpath).text.replace("男 ", "")
    resultLineDataMap['vidioFollowerMan'] = maleShareText
    # The female share is derived as 100 minus the (integer) male percentage.
    malePercent = int(resultLineDataMap['vidioFollowerMan'].replace("%", ""))
    resultLineDataMap['vidioFollowerWomen'] = str(100 - malePercent) + "%"
    resultLineDataMap['vidioProductCatagory1'] = waitPresent(category1Xpath).text
    resultLineDataMap['vidioProductCatagory2'] = waitPresent(category2Xpath).text
    resultLineDataMap['vidioProductName'] = waitPresent(productNameXpath).text
    resultLineDataMap['vidioProductPrice'] = waitPresent(priceXpath).text
    reopenRanking(" , 解析视频详情，开始")

    print("排名：%d , 数据：%s" % (i, str(resultLineDataMap)))


def doWriteDataToExcelForChanMama(filePath=None, dataList=None):
    """Write the scraped rows into sheet 'Sheet1' of an existing workbook.

    Column titles go into row 2 (columns A..R) and the data rows follow from
    row 3 onwards; the workbook is then saved back to ``filePath``.

    Args:
        filePath: Path of the .xlsx file that is loaded and saved in place.
        dataList: Iterable of row dicts; when None the function returns
            immediately without touching the file.

    Returns:
        None.
    """
    if dataList is None:
        return
    print("执行文件写入, data: %s" % str(dataList))
    workbook = openpyxl.load_workbook(filePath)
    sheet = workbook['Sheet1']

    # Titles for columns A..R, in column order.
    headerTitles = (
        "排名",
        "视频",
        "关联商品",
        "达人",
        "点赞数",
        "博主粉丝数",
        "博主是否签约MSN机构",
        "转发数",
        "评论数",
        "发布时间",
        "视频时长（秒）",
        "视频标题字符",
        "女性观众占比（%）",
        "男性观众占比（%）",
        "视频主题",
        "关联产品类别",
        "关联产品明细",
        "关联商品价格（元）",
    )
    for columnLetter, title in zip("ABCDEFGHIJKLMNOPQR", headerTitles):
        sheet[columnLetter + "2"] = title

    # Columns copied straight from the row dict (empty string when the key is missing).
    columnsBeforeDuration = (
        ("B", 'vidioTitle'),
        ("C", 'vidioProductTitle'),
        ("D", 'vidioAuthor'),
        ("E", 'vidioDigg'),
        ("F", 'authorFollowerCount'),
        ("G", 'authorMcnName'),
        ("H", 'vidioShare'),
        ("I", 'vidioComment'),
        ("J", 'vidioPublishTime'),
    )
    columnsAfterDuration = (
        ("M", 'vidioFollowerWomen'),
        ("N", 'vidioFollowerMan'),
        ("O", 'vidioProductName'),
    )

    for rowIndex, data in enumerate(dataList, start=3):
        rowText = str(rowIndex)
        sheet["A" + rowText] = data["rank"]
        for columnLetter, key in columnsBeforeDuration:
            sheet[columnLetter + rowText] = data.get(key, "")

        # Convert a duration such as "1分30秒" into seconds. A marker that is
        # absent (find() == -1) or at index 0 is treated as "not present".
        durationText = data.get('vidioTime', "0分0秒")
        minuteMark = max(durationText.find("分"), 0)
        if minuteMark > 0:
            minuteSeconds = 60 * int(durationText[:minuteMark])
            secondStart = minuteMark + 1
        else:
            minuteSeconds = 0
            secondStart = 0
        secondMark = max(durationText.find("秒"), 0)
        if secondMark > 0:
            seconds = int(durationText[secondStart:secondMark])
        else:
            seconds = 0
        sheet["K" + rowText] = minuteSeconds + seconds

        # Column L is always written empty.
        sheet["L" + rowText] = ""
        for columnLetter, key in columnsAfterDuration:
            sheet[columnLetter + rowText] = data.get(key, "")
        sheet["P" + rowText] = data.get('vidioProductCatagory1', "") + " - " + data.get('vidioProductCatagory2', "")
        # NOTE(review): no code visible in this file stores a 'vidioProductPrice1'
        # key, so this column presumably always ends up empty; confirm the intended key.
        sheet["Q" + rowText] = data.get('vidioProductPrice1', "")
        sheet["R" + rowText] = data.get('vidioProductPrice', "")

    workbook.save(filePath)
    workbook.close()
    print("文件写入结束")


def chanMaMaUseEdge(edgeDriverPath=None, totalCount=2):
    """Log in with Edge, scrape the ranking rows and export them to Excel.

    Args:
        edgeDriverPath: Path of the Edge driver executable. When None the
            function returns immediately.
            NOTE(review): the value is only checked for None; the driver is
            created with ``webdriver.Edge()`` and never receives this path.
        totalCount: Forwarded to doGetChanMaMaDataList.

    Returns:
        None.
    """
    if edgeDriverPath is None:
        return
    startTime = time.time()
    targetUrl = r"https://www.chanmama.com/promotionAwemeRank?big_category=&first_category=&second_category="
    # Fill in real credentials before running.
    userName = "XXXXXXX"
    password = "XXXXXXX"
    resultDataMapList = []

    edgeDriver = webdriver.Edge()
    try:
        edgeDriver.get(targetUrl)
        edgeDriver.maximize_window()
        # NOTE(review): an implicit wait is combined with explicit
        # WebDriverWait calls; Selenium's documentation advises against mixing them.
        edgeDriver.implicitly_wait(5)
        time.sleep(5)

        inputName = WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, htmlLoginXPathName)))
        inputName.send_keys(userName)
        inputPwd = WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, htmlLoginXPathPwd)))
        inputPwd.send_keys(password)
        loginBtn = WebDriverWait(edgeDriver, 5).until(EC.element_to_be_clickable((By.XPATH, htmlLoginBtnXPath)))
        loginBtn.click()
        time.sleep(5)

        doGetChanMaMaDataList(resultDataMapList, edgeDriver, totalCount)
    finally:
        # Always shut the browser down, even when login or scraping raised;
        # otherwise the Edge and driver processes are left running.
        edgeDriver.quit()

    # Write the collected rows to Excel.
    writeDataToExcelForChanMamaPath = r"D:\02 Work\4px\2022\20_业余\chanmama.xlsx"
    doWriteDataToExcelForChanMama(writeDataToExcelForChanMamaPath, resultDataMapList)

    print("总耗时：%d" % (time.time() - startTime))


if __name__ == "__main__":
    # Script entry point: local Edge driver location and the row count passed to the scraper.
    edgeDriverPath, totalCount = r"D:\Program_Files\python-3.7.0\msedgedriver.exe", 67
    chanMaMaUseEdge(edgeDriverPath=edgeDriverPath, totalCount=totalCount)

方案二代码：

略