首先,我们需要安装 requests 库和 lxml 库:
pip install requests
pip install lxml
安装完成之后,开始爬取 4399 网站。
通过 requests 的 get 方法请求页面,并设置 User-Agent(UA)请求头来模拟浏览器请求。
# 导入,模块
import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Address of the page to scrape. Later steps also use it as the base when
# building absolute links from the hrefs found in the page.
url = 'http://www.4399.com'
# Send a desktop-browser User-Agent so the request looks like a browser's.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
}
# Request the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the real content.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
# Turn the raw bytes into str. (response.text is the other way to do this,
# but it relies on the charset declared in the HTTP headers.)
# apparent_encoding is a guess made from the body and can be None when
# detection fails, so fall back to the declared encoding and then to UTF-8.
encoding = response.apparent_encoding or response.encoding or 'utf-8'
# errors='replace' tolerates the odd byte that does not fit the guessed
# charset instead of aborting the whole run with UnicodeDecodeError.
html = response.content.decode(encoding, errors='replace')
print(html)
这样我们就获取到了页面的源码。
接下来就是提取数据。
先确定页面元素的标签和属性,再利用 lxml 的 XPath 提取数据。
# Parse the page source into an element tree that can be queried with XPath.
# NOTE: from here on `html` is the parsed tree, no longer the source string.
html = etree.HTML(html)
# Each <div class="mi-lr"> holds one category heading plus its list of games.
game_titles = html.xpath('//div[@class="mi-lr"]')

# open() in append mode creates the file but not its parent directory.
os.makedirs('./data', exist_ok=True)

# Open the output file once for the whole pass rather than once per line.
with open('./data/4399.txt', 'a', encoding='utf-8') as f:
    for tit in game_titles:
        # Category name and link both come from the <a class="mi_tit"> child.
        title_texts = tit.xpath('./a[@class="mi_tit"]/text()')
        title_hrefs = tit.xpath('./a[@class="mi_tit"]/@href')
        if not title_texts or not title_hrefs:
            # No usable heading link in this section; skip it instead of
            # crashing with IndexError.
            continue
        title = title_texts[0]
        # urljoin resolves site-relative hrefs against `url` and leaves
        # already-absolute ones untouched, so the '专辑' heading (whose href
        # was previously used as-is) needs no special case.
        title_url = urljoin(url, title_hrefs[0])
        print(title, ':', title_url)
        f.write(title + ':' + title_url + '\n' + '-' * 50 + '\n')
        print('-' * 50)

        # Games of this category: the direct children of <div class="mi_d">.
        game_lists = tit.xpath('./div[@class="mi_d"]')
        games = game_lists[0] if game_lists else []
        for game in games:
            # The name is either the anchor's own text or, when the anchor
            # wraps it in <b>, the text of that <b>.
            names = game.xpath('./a/text()') or game.xpath('./a/b/text()')
            hrefs = game.xpath('./a/@href')
            if not names or not hrefs:
                # Iterating an lxml element also yields non-game children
                # (e.g. comment nodes); ignore anything without a named link.
                continue
            game_name = names[0]
            game_url = urljoin(url, hrefs[0])
            print(game_name, ':', game_url)
            f.write(game_name + ':' + game_url + '\n')

        # Visual separator between categories, in the file and on stdout.
        f.write("=" * 50 + '\n' + '\n')
        print("=" * 50)
        print()
# Both values come from the same <a class="tit_a"> inside the
# <div class="tm_fun h_3"> panel: its text is the panel title, and its href
# (appended to the site root) is the panel link.
tit_a_xpath = '//div[@class="tm_fun h_3"]//a[@class="tit_a"]'
good_game_title = html.xpath(tit_a_xpath + '/text()')[0]
good_game_href = html.xpath(tit_a_xpath + '/@href')[0]
good_game_url = url + good_game_href
提取完成,接下来保存数据。
可以将数据保存为 csv、txt 等格式:
# open() in append mode creates the file but not its parent directory.
os.makedirs('./data', exist_ok=True)

# Every <li> inside the <div class="tm_fun h_3"> panel is one entry.
good_games = html.xpath('//div[@class="tm_fun h_3"]//li')

# Open the output file once for the whole section rather than once per entry.
with open('./data/4399.txt', 'a', encoding='utf-8') as f:
    # Section header: the panel title and its link.
    f.write(good_game_title + ':' + good_game_url + '\n' + '\n')
    print(good_game_title + ':' + good_game_url)
    print(len(good_games))

    for good in good_games:
        names = good.xpath('./a/text()')
        hrefs = good.xpath('./a/@href')
        if not names or not hrefs:
            # An <li> without a direct text link has nothing to record;
            # skip it instead of crashing with IndexError.
            continue
        good_game_name = names[0]
        # urljoin resolves site-relative hrefs against `url` and leaves
        # already-absolute ones untouched.
        good_game_path = urljoin(url, hrefs[0])
        print(good_game_name + ':' + good_game_path)
        f.write(good_game_name + ':' + good_game_path + '\n' + '\n')