Python web scraping: using Selenium (optionally headless) to scrape the TapTap game ranking boards and save them to CSV

This post uses Selenium to scrape the TapTap game ranking boards and save the results as a CSV file.

Site: https://www.taptap.com/top/download

Fields: board, game name, rank, developer, rating, and genre.

Each new batch of entries only becomes reachable by XPath after scrolling to the bottom of the page and clicking the “更多” (more) button.
Reference for scrolling with Selenium: https://www.codenong.com/48006078/
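
The core idiom from that reference is to keep scrolling until `document.body.scrollHeight` stops growing; a minimal sketch, assuming `browser` is an already-started WebDriver:

```python
import time

# Scroll until the page height stops changing, i.e. no more lazy-loaded content.
last_height = browser.execute_script("return document.body.scrollHeight")
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(2)  # give newly loaded entries time to render
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
```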

The full script:

from selenium import webdriver                               # WebDriver entry point
from selenium.webdriver.support.wait import WebDriverWait    # explicit waits for locating elements
import time
import pandas as pd

# 1. Configure browser options
option = webdriver.ChromeOptions()
option.add_argument('disable-infobars')
# option.add_argument('headless')   # uncomment to run Chrome without a visible window

# 2. Point Selenium at the chromedriver executable and start the browser
browser = webdriver.Chrome(
    executable_path='C:/Users/wzq1643/PycharmProjects/untitled/chromedriver.exe',
    options=option
)
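
Note: `executable_path` is the Selenium 3 API this script was written against. On Selenium 4 the driver path goes through a `Service` object instead; a minimal sketch, assuming selenium>=4 is installed:

```python
# Selenium 4 equivalent of the driver setup above (assumption: selenium>=4).
# The rest of this post keeps the Selenium 3 API it was originally written for.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service('C:/Users/wzq1643/PycharmProjects/untitled/chromedriver.exe')
browser = webdriver.Chrome(service=service, options=option)
```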

url_list = {
    "热门榜": "https://www.taptap.com/top/download",
    "新品榜": "https://www.taptap.com/top/new",
    "预约榜": "https://www.taptap.com/top/reserve",
    "热卖榜": "https://www.taptap.com/top/sell",
    "热玩榜": "https://www.taptap.com/top/played",
}
df_list = []

# The original script repeated the same scraping block once per board;
# the boards are processed here in that same order, inside a single loop.
for board in ["预约榜", "热门榜", "热卖榜", "热玩榜", "新品榜"]:
    time.sleep(10)
    try:
        browser.get(url_list[board])
    except Exception:
        pass  # ignore load timeouts; the partially rendered page is still usable

    # Scroll to the bottom and click the "更多" (more) button four times
    # so that additional entries are rendered into the DOM.
    for _ in range(4):
        time.sleep(10)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        more_button = WebDriverWait(driver=browser, timeout=100).until(
            lambda x: x.find_element_by_xpath('//button[@class="btn btn-primary btn-lg"]')
        )
        try:
            more_button.click()
        except Exception:
            pass  # the button may be covered or already gone; just continue

    time.sleep(10)
    # find_elements_by_xpath returns every matching WebElement as a list.
    name_elems = WebDriverWait(driver=browser, timeout=5).until(
        lambda x: x.find_elements_by_xpath('//div[@class="top-card-middle"]/a/h4')
    )
    vendor_elems = WebDriverWait(driver=browser, timeout=5).until(
        lambda x: x.find_elements_by_xpath('//div[@class="top-card-middle"]/p/a')
    )
    score_elems = WebDriverWait(driver=browser, timeout=5).until(
        lambda x: x.find_elements_by_xpath('//div[@class="top-card-middle"]/div[@class="card-middle-score"]/p')
    )
    category_elems = WebDriverWait(driver=browser, timeout=5).until(
        lambda x: x.find_elements_by_xpath('//div[@class="top-card-middle"]/div[@class="card-middle-category"]/a')
    )

    games = [e.text for e in name_elems]
    ranks = list(range(1, len(games) + 1))
    vendors = [e.text for e in vendor_elems]
    scores = [e.text for e in score_elems]
    categories = [e.text for e in category_elems]
    print(len(games), len(vendors), len(scores), len(categories), len(ranks))

    # "榜单" is a scalar, so pandas broadcasts it across every row.
    data = {"榜单": board, "排名": ranks, "游戏": games,
            "厂商": vendors, "评分": scores, "类型": categories}
    df = pd.DataFrame(data)
    print(df)
    df_list.append(df)
    time.sleep(20)   # politeness delay before loading the next board

browser.quit()   # close the browser

# Concatenate the five per-board DataFrames and write one CSV.
# encoding="utf_8_sig" adds a BOM so Excel displays the Chinese text correctly.
df_end = pd.concat(df_list, axis=0, join="inner", ignore_index=True)
print(df_end)
df_end.to_csv("C:/Users/wzq1643/Desktop/taptapgame.csv", encoding="utf_8_sig")

The script prints the combined DataFrame and writes taptapgame.csv (output screenshot omitted).
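
To sanity-check the file, it can be read back with pandas (a quick check against the path used above; `index_col=0` skips the unnamed index column that `to_csv` wrote):

```python
import pandas as pd

check = pd.read_csv("C:/Users/wzq1643/Desktop/taptapgame.csv", index_col=0)
print(check.head())
print(check["榜单"].value_counts())  # row counts per board
```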
