1.创建scrapy项目
安装包
pip install scrapy
然后进入项目文件夹
scrapy startproject 项目名
scrapy genspider 爬虫名 域名
以buff为例
scrapy startproject buff
scrapy genspider buffprice www.xxx.com
创建完成后
2.获取网易buff饰品的api和参数
url:https://buff.163.com/api/market/goods
3.编写爬虫
# Scrapy spider that crawls the BUFF market goods API (buff.163.com).
# NOTE(review): the class body has lost its indentation in this document;
# indent the attributes and methods under the class before running it.
class BuffpriceSpider(scrapy.Spider):
# Name used to identify this spider.
name = 'buffprice'
# Price-history endpoint; not referenced by the code shown in this section.
urlp='https://buff.163.com/api/market/goods/price_history/buff'
cookies=[{'Device-Id': 'DHupMTHY21dU88DHKKxm', ' Locale-Supported': 'zh-Hans', ' game': 'csgo', ' NTES_YD_SESS': 'sdRnq33P8uTwN2I2T.Axt5ZsW1pp1BTXHPCMK5t9QQ3a8Ftg8d_0OTbKnOw.peSOyyyDJLbZIJUImbyxKp7aaAOMuS2KxTi30YeARv7oz3XyhNlQRrqf.USZjJuDGZpTo2wmqT3iCGlGTbK565Uvfb.cjfSje2TaZmLB4lJOiOOovY2KF0nkpTyzJ7SJvPLjxpXvBFD7fBdYFhXW4LirCUwfLWXubq3aPqUMqUwkouyz.', ' S_INFO': '1681704803|0|0&60##|16581939586', ' P_INFO': '16581939586|1681704803|1|netease_buff|00&99|sic&1681652238&netease_buff#sic&510100#10#0#0|&0|null|16581939586', ' remember_me': 'U1106380554|Pj2YpQ0dSCQgZHWo7BmX3jA2wXRqrE2V', ' session': '1-xOpClokz5XCP2uTasjgKvqzYAfLeb88QyDW32zL9l6HO2034014290', ' csrf_token': 'IjQ3ZmM4OGNlMjdiMDE3OTZlZmE4YWMzOTUxM2NjYWE3YTFlYmIwM2Qi.Fx5Y9A.c7j1TlR1aO4xZydZbWkLJb2f-ik'},{'Device-Id': 'Qd13dM33YjFu5uEyOUVS', ' Locale-Supported': 'zh-Hans', ' game': 'csgo', ' NTES_YD_SESS': 'BERnNqenNhpaucVh8kJsjAPXbrbFeg4pUOEy8neUuuTstMe.tG1Vzrs2hE227KgKO7KpArtQohZQhKNq0weIGbSYTD5aGpmTUyVAUcHRJfJ3Y1Gmxla66AHqmRdLkqFraCIb5rTPEkDkr48nYnQj74JKm7ZmpCrsqbgf0DRzPzzPBvVV4rCHOgKslzOmHgvsstPr34SKbMI938rA0uNuBFcEEhfHag5c1w4NywkGpcnvJ', ' S_INFO': '1681704859|0|0&60##|17030031991', ' P_INFO': '17030031991|1681704859|1|netease_buff|00&99|sic&1681696671&netease_buff#sic&510100#10#0#0|&0|null|17030031991', ' remember_me': 'U1106382607|PQD0AqsSJTaP3alxmZCpFdROPToSdNaY', ' session': '1-myjrLAIqOnbtflxm5vLjJz4ty8l_Wb48tWMRjLKVJMa72034012247', ' csrf_token': 'IjFiZjFlNTZmZDMxZTM2MzdkN2NlZWM0NDhmMDgyMTRjMmYwYWQ3YzEi.Fx5ZIA.MHVgUgEfQOTTXQXd18QiAapUJlM'},{'Device-Id': 'rKHlSmE6bFUmCirkgjDk', ' Locale-Supported': 'zh-Hans', ' game': 'csgo', ' NTES_YD_SESS': 'QFAzQExV0cDcg34ReIIaSG3gAEDH2ouJgcb21vZfHHJUSnZrSDeaKRZjluKgwHQ5xag0LXxXWURcnIUea684IOuph1GduKWywqrMRcJcHuNWjgk2EsSAqBqMxil3LM5R9mVwyRJYbLWLRI1vzv7EdIh_xdFxqmRUMwTuoWiKYKKlPNuZ9QbSzRH71l4dLH7N32M5_2QqH_v6Ad9qPl08as6vE46FR5WM5R04tPOyMFt.h', ' S_INFO': '1681704912|0|0&60##|16537761833', ' P_INFO': 
'16537761833|1681704912|1|netease_buff|00&99|sic&1681696581&netease_buff#sic&510100#10#0#0|&0|null|16537761833', ' remember_me': 'U1106387923|aqxeUnC6vHMes8UTpb84UL7CRqhzUIJS', ' session': '1-KaY6HG6_1rCK3vhnT8h1Wm1D2-ia45IK-dr4nkE68MsF2034023563', ' csrf_token': 'ImUyOWYzYzUwNzE1ODVkODdhMjM5ODBkNGNlY2NiYzU5ZDgzNGRmNWIi.Fx5ZVw.THhhI3En7ft8vNNBT6K--d5s3qY'}]
# Number of the page most recently requested. parse() increments it before
# asking for the next page, so it starts at 1 to match the page in start_urls.
a=1
#allowed_domains = ['www.xxx.com']
# The crawl starts from the first page of the goods listing (page_num=1), so
# that the pages requested afterwards (2, 3, ...) follow on without repeats.
start_urls = ['https://buff.163.com/api/market/goods?game=csgo&page_num=1&use_suggestion=0&_=1681384999985']
我们从第一页开始爬取，所以起始url使用第一页的url。
由于这些数据都要登录后才能查看，所以我们要记录cookie。
def parse(self, response):
    """Handle one page of the goods listing and schedule the next page.

    Yields one dict per entry found under ``data -> items`` in the JSON
    response (id, name, appid, sell_num, sell_min_price), followed by a
    request for the next page. Pagination stops as soon as a page carries
    no items, so the crawl terminates instead of requesting pages forever.

    NOTE(review): the per-request ``cookies=`` argument is handled by
    Scrapy's cookies middleware; if ``COOKIES_ENABLED = False`` is set in
    the project settings that middleware is off and this argument is not
    applied -- confirm which of the two mechanisms is intended.
    """
    payload = response.json()

    # Presumably an error reply (e.g. not logged in) carries no usable
    # `data` object -- TODO confirm; treat that the same as an empty page.
    goods = (payload.get('data') or {}).get('items') or []
    if not goods:
        self.logger.info('No items on page %s; stopping pagination', self.a)
        return

    for entry in goods:
        yield {
            'id': entry['id'],
            'name': entry['name'],
            'appid': entry['appid'],
            'sell_num': entry['sell_num'],
            'sell_min_price': entry['sell_min_price'],
        }

    # Advance the page counter and request the following page.
    self.a = self.a + 1
    data = {
        'game': 'csgo',
        'page_num': str(self.a),
        'use_suggestion': '0',
        '_': tim(),  # millisecond timestamp
    }
    # With method='GET', FormRequest appends `formdata` to the URL as the
    # query string instead of sending it as a request body.
    yield scrapy.FormRequest(
        url='https://buff.163.com/api/market/goods',
        cookies=random.choice(self.cookies),
        method='GET',
        callback=self.parse,
        formdata=data,
    )
def tim():
    """Return the current Unix time in milliseconds as a decimal string.

    The value is used for the ``_`` query parameter of the goods API request.
    """
    return str(round(time.time() * 1000))
我们用response.json()将响应的json数据解析为Python字典，便于数据的提取
这里tim()函数是用来生成_参数一个毫秒级的时间戳
因为请求需要携带查询参数，所以这里使用scrapy.FormRequest
method='GET' 声明发送方法
callback=self.parse 声明获取的数据传输给哪个函数处理
formdata=data 类似于requests.get中的params、requests.post中的data
cookies= 添加cookie，这个参数需要先将cookie字符串转换为字典（如下所示）
cookie='Device-Id=rKHlSmE6bFUmCirkgjDk; Locale-Supported=zh-Hans; game=csgo; NTES_YD_SESS=QFAzQExV0cDcg34ReIIaSG3gAEDH2ouJgcb21vZfHHJUSnZrSDeaKRZjluKgwHQ5xag0LXxXWURcnIUea684IOuph1GduKWywqrMRcJcHuNWjgk2EsSAqBqMxil3LM5R9mVwyRJYbLWLRI1vzv7EdIh_xdFxqmRUMwTuoWiKYKKlPNuZ9QbSzRH71l4dLH7N32M5_2QqH_v6Ad9qPl08as6vE46FR5WM5R04tPOyMFt.h; S_INFO=1681704912|0|0&60##|16537761833; P_INFO=16537761833|1681704912|1|netease_buff|00&99|sic&1681696581&netease_buff#sic&510100#10#0#0|&0|null|16537761833; remember_me=U1106387923|aqxeUnC6vHMes8UTpb84UL7CRqhzUIJS; session=1-KaY6HG6_1rCK3vhnT8h1Wm1D2-ia45IK-dr4nkE68MsF2034023563; csrf_token=ImUyOWYzYzUwNzE1ODVkODdhMjM5ODBkNGNlY2NiYzU5ZDgzNGRmNWIi.Fx5ZVw.THhhI3En7ft8vNNBT6K--d5s3qY'
# Convert the raw "k1=v1; k2=v2" Cookie header string into a dict.
# Each pair is split on the first '=' only, because a value may itself contain
# '=', and names/values are stripped so that the blank following each ';'
# does not end up as a leading space in the key.
cookies = {
    key.strip(): value.strip()
    for key, _, value in (pair.partition('=') for pair in cookie.split(';'))
    if key.strip()
}
print(cookies)
4.setting设置
ROBOTSTXT_OBEY = False 改为False
COOKIES_ENABLED = False 解注释（注意：设为False会禁用Scrapy的cookies中间件，此时Request的cookies=参数不会生效，请求只会携带下面DEFAULT_REQUEST_HEADERS中的cookie）
DEFAULT_REQUEST_HEADERS = {
'cookie': 'Device-Id=DHupMTHY21dU88DHKKxm; Locale-Supported=zh-Hans; game=csgo; NTES_YD_SESS=sdRnq33P8uTwN2I2T.Axt5ZsW1pp1BTXHPCMK5t9QQ3a8Ftg8d_0OTbKnOw.peSOyyyDJLbZIJUImbyxKp7aaAOMuS2KxTi30YeARv7oz3XyhNlQRrqf.USZjJuDGZpTo2wmqT3iCGlGTbK565Uvfb.cjfSje2TaZmLB4lJOiOOovY2KF0nkpTyzJ7SJvPLjxpXvBFD7fBdYFhXW4LirCUwfLWXubq3aPqUMqUwkouyz.; S_INFO=1681704803|0|0&60##|16581939586; P_INFO=16581939586|1681704803|1|netease_buff|00&99|sic&1681652238&netease_buff#sic&510100#10#0#0|&0|null|16581939586; remember_me=U1106380554|Pj2YpQ0dSCQgZHWo7BmX3jA2wXRqrE2V; session=1-xOpClokz5XCP2uTasjgKvqzYAfLeb88QyDW32zL9l6HO2034014290; csrf_token=IjQ3ZmM4OGNlMjdiMDE3OTZlZmE4YWMzOTUxM2NjYWE3YTFlYmIwM2Qi.Fx5Y9A.c7j1TlR1aO4xZydZbWkLJb2f-ik',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34'
}
#为起始请求添加cookie
~~~## 标题