Python爬取豌豆荚软件分类以及下载量
一、查看网页
链接豌豆荚
红框内即为要爬取的元素
二、获取所有页签的地址:
#爬取豌豆荚
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Collect the URL of every category link on the app category index page.
# A timeout keeps the script from hanging forever on a stalled connection.
data = requests.get('https://www.wandoujia.com/category/app', timeout=10)
# Fail fast on an HTTP error status instead of parsing an error page.
data.raise_for_status()
s = BeautifulSoup(data.text, "html.parser")
# Take the fifth <div> in the document, then its first <ul>; for every <li>
# in that list, gather all <a> tags found under the <li>'s first <div>.
# NOTE(review): the positional index [4] depends on the current page layout --
# confirm it still selects the category list.
divs = [li.div.find_all('a') for li in s.find_all('div')[4].find_all('ul')[0].find_all('li')]
# Map each link's "title" attribute to its "href" attribute.
urls_dict = {}
for links in divs:
    for link in links:
        title = link.attrs['title']
        url = link.attrs['href']
        urls_dict[title] = url
{'视频': 'https://www.wandoujia.com/category/5029_716',
'直播': 'https://www.wandoujia.com/category/5029_1006',
'音乐': 'https://www.wandoujia.com/category/5029_722',
'K歌': 'https://www.wandoujia.com/category/5029_718',
'铃声': 'https://www.wandoujia.com/category/5029_719',
'收音机': 'https://www.wandoujia.com/category/5029_837',
'WiFi': 'https://www.wandoujia.com/category/5018_895',
'浏览器': 'https://www.wandoujia.com/category/5018_599',
'输入法': 'https://www.wandoujia.com/category/5018_597',
'优化': 'https://www.wandoujia.com/category/5018_596',
'省电': 'https://www.wandoujia.com/category/5018_601',
'安全': 'https://www.wandoujia.com/category/5018_598',
'Root': 'https://www.wandoujia.com/category/5018_947',
'文件管理': 'https://www.wandoujia.com/category/5018_948',
'聊天': 'https://www.wandoujia.com/category/5014_710',
'交友': 'https://www.wandoujia.com/category/5014_713',
‘电话通讯’: ‘