'''
Index page fields:
    icon URL, download count, size, detail-page URL

Detail page fields:
    game name, approval rating, comment count, editor's review, download URL,
    description, user comments, 1-5 screenshot URLs

Category API endpoints (one per page):
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

32
'''
import re

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Connect to the MongoDB server on localhost:27017.
client = MongoClient(host='localhost', port=27017)

# Collection that stores the records scraped from the index (listing) page.
index_col = client.get_database('wandoujia').get_collection('index')

# Collection that stores the records scraped from each app's detail page.
detail_col = client.get_database('wandoujia').get_collection('detail')

# 1. Send the request
def get_page(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests.Response`` object. The status code is not checked.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return response

# 2. Parsing
def _tag_text(soup, tag_name, css_class):
    """Return the text of the first ``tag_name`` tag with ``css_class``, or None."""
    tag = soup.find(name=tag_name, attrs={"class": css_class})
    return tag.text if tag is not None else None


def _tag_attr(soup, tag_name, css_class, attr):
    """Return attribute ``attr`` of the first matching tag, or None if absent."""
    tag = soup.find(name=tag_name, attrs={"class": css_class})
    return tag.attrs.get(attr) if tag is not None else None


# Parse the detail page
def parse_detail(text):
    """Extract one app's details from its detail-page HTML and store them.

    The scraped values are printed, then written as one document to
    ``detail_col``. A missing approval rating or download link is replaced by
    a placeholder string; any other missing field is stored as ``None`` so a
    partially filled page still yields a record instead of raising.

    Args:
        text: HTML source of the detail page.
    """
    soup = BeautifulSoup(text, 'lxml')

    # App name
    name = _tag_text(soup, 'span', 'title')
    # Approval rating
    love = _tag_text(soup, 'span', 'love')
    # Number of comments
    commit_num = _tag_text(soup, 'a', 'comment-open')
    # Editor's review
    commit_content = _tag_text(soup, 'div', 'con')
    # App download link
    download_url = _tag_attr(soup, 'a', 'normal-dl-btn', 'href')

    # Format with the scraped variables (not their names as string literals)
    # so the summary shows the real values.
    print('''
    ============= tank ==============
    app名称:{name}
    好评率: {love}
    评论数: {commit_num}
    小编点评: {commit_content}
    app下载链接: {download_url}
    ============= end ==============
    '''.format(name=name, love=love, commit_num=commit_num,
               commit_content=commit_content, download_url=download_url))

    # Build the record once so it is always defined. Each placeholder is
    # applied independently, so one fallback cannot overwrite the other.
    detail_data = {
        'name': name,
        'love': love or "没人点赞,很惨",
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url or "没有安装包",
    }

    # Insert the detail-page record. insert_one replaces Collection.insert,
    # which no longer exists in PyMongo 4.
    detail_col.insert_one(detail_data)
    print('{name}app数据插入成功!'.format(name=name))


# Matches a size such as "25MB"; compiled once because it is reused for
# every app card.
_SIZE_PATTERN = re.compile(r"\d+MB")


# Parse the index page
def parse_index(data):
    """Extract every app card from the listing markup, store it, then crawl it.

    For each ``<li class="card">`` the icon URL, download count, size and
    detail-page URL are printed and inserted into ``index_col``. The detail
    page is then fetched with ``get_page`` and handed to ``parse_detail``.

    Args:
        data: HTML markup containing the app cards.
    """
    soup = BeautifulSoup(data, 'lxml')

    # All <li> tags that represent one app
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # Icon URL: the data-original attribute of the card's first <img>
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count: text of the <span class="install-count">
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # Size: the <span> whose text contains digits followed by "MB".
        # Search inside this card, not the whole document, otherwise every
        # record would get the first card's size.
        size_tag = app.find(name='span', text=_SIZE_PATTERN)
        size = size_tag.text if size_tag is not None else None
        print(size)

        # Detail-page URL: href of the card's first <a>
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # Assemble the record
        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url,
        }

        # Insert the record. insert_one replaces Collection.insert, which no
        # longer exists in PyMongo 4.
        index_col.insert_one(index_data)
        print('主页数据插入成功!')

        # 3. Request the app's detail page
        response = get_page(detail_url)

        # 4. Parse the app's detail page
        parse_detail(response.text)


def main():
    """Crawl pages 1-32 of the category API and store every app found.

    Each page is requested with ``get_page``, decoded from JSON, and the
    markup under ``data['data']['content']`` is passed to ``parse_index``.
    The MongoDB client is closed when the crawl ends, including when a page
    raises.
    """
    try:
        for line in range(1, 33):
            # Substitute the page number; without .format() every request
            # would carry the literal text "{line}".
            url = "https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B".format(line=line)

            # 1. Request the app listing API
            response = get_page(url)
            print('*' * 1000)

            # Deserialize the JSON body into a dict
            data = response.json()

            # Markup holding the app cards for this page
            app_li = data['data']['content']

            # 2. Parse the app cards
            parse_index(app_li)
    finally:
        # Close the MongoDB client once all work is done, even on error.
        client.close()

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()