用selenium爬取taptap游戏榜单,并保存为csv。
网站:https://www.taptap.com/top/download
字段:榜单、游戏名称、排名、厂商、评分、游戏类型。
榜单数据是分页加载的:需要先把页面滚动到底部,再点击“更多”按钮,新加载出来的条目才能被xpath定位并爬取到。
selenium滑动参考:https://www.codenong.com/48006078/
代码如下:
import time

import pandas as pd
from selenium import webdriver  # browser automation entry point
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits used when locating elements
# Step 1: build the Chrome launch options.
option = webdriver.ChromeOptions()
option.add_argument('disable-infobars')
# Uncomment the next line to run without opening a browser window.
# option.add_argument('headless')

# Step 2: start Chrome through the chromedriver binary at the path below.
chromedriver_path = 'C:/Users/wzq1643/PycharmProjects/untitled/chromedriver.exe'
browser = webdriver.Chrome(
    options=option,
    executable_path=chromedriver_path,
)
# Leaderboard display names; the scraping code selects boards by index
# into this list.
name_list = [
    "热门榜",
    "预约榜",
    "热卖榜",
    "热玩榜",
    "新品榜",
]

# Leaderboard display name -> page URL.
url_list = {
    "热门榜": "https://www.taptap.com/top/download",
    "新品榜": "https://www.taptap.com/top/new",
    "预约榜": "https://www.taptap.com/top/reserve",
    "热卖榜": "https://www.taptap.com/top/sell",
    "热玩榜": "https://www.taptap.com/top/played",
}

# Collects one DataFrame per scraped leaderboard.
df_list = []
# XPath locators for the "more" button and for each scraped column.
MORE_BUTTON_XPATH = '//button[@class="btn btn-primary btn-lg"]'
TITLE_XPATH = '//div[@class="top-card-middle"]/a/h4'
VENDOR_XPATH = '//div[@class="top-card-middle"]/p/a'
SCORE_XPATH = '//div[@class="top-card-middle"]/div[@class="card-middle-score"]/p'
CATEGORY_XPATH = '//div[@class="top-card-middle"]/div[@class="card-middle-category"]/a'

# Indexes into name_list, in the order the boards are visited. The order
# determines the row order of the final concatenated table.
SCRAPE_ORDER = (1, 0, 2, 3, 4)


def _open_board(driver, url):
    """Pause, then navigate to *url*; a failed navigation is reported, not fatal."""
    time.sleep(10)
    try:
        driver.get(url=url)
    except WebDriverException as exc:
        # Best effort: carry on with whatever the browser managed to load.
        print("failed to open", url, "-", exc)


def _load_more(driver, clicks=4):
    """Scroll to the bottom and click the "more" button up to *clicks* times.

    Stops early when the button cannot be located within the wait timeout.
    A click that fails is reported and the loop moves on to the next round.
    """
    for _ in range(clicks):
        time.sleep(10)
        # Scroll down so the "more" button can be located and clicked.
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        try:
            more_button = WebDriverWait(driver=driver, timeout=100).until(
                lambda d: d.find_element(By.XPATH, MORE_BUTTON_XPATH)
            )
        except TimeoutException:
            # Keep what is already on the page instead of aborting the run.
            print("'more' button not found; continuing with the rows already loaded")
            break
        try:
            more_button.click()
            print(more_button)
        except WebDriverException as exc:
            print("clicking the 'more' button failed:", exc)


def _collect_texts(driver, xpath, timeout=5):
    """Return the text of every element matching *xpath*, in document order.

    Raises:
        TimeoutException: if nothing matches within *timeout* seconds (an
            empty result list is falsy, so the wait keeps polling).
    """
    elements = WebDriverWait(driver=driver, timeout=timeout).until(
        lambda d: d.find_elements(By.XPATH, xpath)
    )
    return [element.text for element in elements]


def _scrape_board(driver, board_name, url):
    """Scrape one leaderboard page into a DataFrame.

    Columns: board name, rank (1-based position on the page), game title,
    vendor, score and category.

    Raises:
        TimeoutException: if any column locator matches no element.
        ValueError: raised by pandas when the columns differ in length.
    """
    _open_board(driver, url)
    _load_more(driver)
    time.sleep(10)

    titles = _collect_texts(driver, TITLE_XPATH)
    vendors = _collect_texts(driver, VENDOR_XPATH)
    scores = _collect_texts(driver, SCORE_XPATH)
    for score in scores:
        print(score)
    categories = _collect_texts(driver, CATEGORY_XPATH)
    ranks = list(range(1, len(titles) + 1))

    # Each column is located independently, so the lengths are printed to
    # make a mismatch visible before pandas rejects it.
    print(len(titles), len(vendors), len(scores), len(categories), len(ranks))
    board_df = pd.DataFrame(
        {
            "榜单": board_name,
            "排名": ranks,
            "游戏": titles,
            "厂商": vendors,
            "评分": scores,
            "类型": categories,
        }
    )
    print(board_df)
    return board_df


try:
    for position, board_index in enumerate(SCRAPE_ORDER):
        if position:
            # Pause between consecutive boards.
            time.sleep(20)
        board_name = name_list[board_index]
        df_list.append(_scrape_board(browser, board_name, url_list[board_name]))
except BaseException:
    # Do not leave an orphaned browser/driver process behind when scraping
    # aborts; the original error is re-raised unchanged.
    browser.quit()
    raise
browser.quit()  # close the browser

# Stack every per-board frame into one table. Passing df_list itself (rather
# than five hard-coded indexes) keeps this correct if boards are added or
# removed. join="inner" keeps only columns common to all frames, and
# ignore_index=True renumbers the rows 0..n-1.
df_end = pd.concat(df_list, axis=0, join="inner", ignore_index=True)
print(df_end)

# utf_8_sig writes a UTF-8 byte-order mark at the start of the file
# (presumably so spreadsheet applications detect the encoding -- confirm).
df_end.to_csv("C:/Users/wzq1643/Desktop/taptapgame.csv", encoding="utf_8_sig")
结果如下: