今天尝试获取淘宝数据:以关键字“按摩椅”进行搜索,获取商品的链接、淘宝店铺名称、型号、价格、销量、发货地等信息。
本文仅供交流学习使用,请勿用于商业用途或违法行为,请勿高频访问。
如有数据获取需求,请于后台联系。
通过发消息页面的底部菜单可以添加号主VX私聊,添加时请注明来意。
界面展示
页面分析
效果展示
保存到本地
需要源码/数据采集的朋友
上代码
import re
import json
import random
import hashlib
import pymongo
import redis
import asyncio
import threading
class TB(object):
    """Crawler that collects Taobao search results through a browser page.

    One thread listens for the search API responses and stores the parsed
    items (``parse_data``); the main thread keeps clicking the "next page"
    button (``click_button``) until ``max_count`` items have been saved.

    NOTE(review): ``WebPage`` and ``get_md5`` are used below but are neither
    imported nor defined in the visible part of this file. The page API used
    here (``listen.start`` / ``listen.wait`` / ``response.body``) looks like
    DrissionPage's -- confirm and add the missing import/helper.
    NOTE(review): parts of ``parse_data`` were elided ("....." lines) in the
    original post; they are kept as marked comments, so the method cannot
    work until that code is restored.
    """

    def __init__(self):
        """Open the MongoDB and Redis connections and create the browser page."""
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        # Target collection: database 'spider', collection 'xxxx'.
        self.conn = self.client['spider']['xxxx']
        # Redis client with default connection settings; used for de-duplication.
        self.self_redis = redis.Redis()
        self.page = WebPage()
        # Number of items saved so far / number of items at which to stop.
        self.count = 0
        self.max_count = 200

    def save_data(self, _item):
        """Insert ``_item`` into MongoDB unless it was already seen.

        The digest of ``_item['band']`` is added to the Redis set
        'filter:xxxx'; the item is inserted only when that add reports a new
        member, otherwise it is treated as a duplicate.

        Args:
            _item: dict describing one product; must contain the key 'band'.
        """
        value = get_md5(_item['band'])
        res = self.self_redis.sadd('filter:xxxx', value)
        if res:
            self.conn.insert_one(_item)
            self.count += 1
            print(f"插入数据成功, 已保存{self.count}条")
        else:
            print("数据校验重复")

    async def click_button(self):
        """Click the "next page" button until enough items have been saved.

        Sleeps a random 5.5-6.6 seconds between clicks to keep the request
        rate low.
        """
        print('next_page')
        while self.count < self.max_count:
            btn = self.page('@xx=xxxl xxxinatxxm next-next')
            btn.click()
            await asyncio.sleep(random.randint(5500, 6600) / 1000)

    def run(self):
        """Run the crawl: parse responses in a thread while paging in this one.

        The browser is always closed on exit, even when paging fails.
        """
        # Daemon thread: if paging raises, a worker blocked on the listener
        # must not keep the interpreter alive.
        parse_data_thread = threading.Thread(target=self.parse_data, daemon=True)
        parse_data_thread.start()
        try:
            # asyncio.run() replaces the deprecated
            # get_event_loop().run_until_complete() pattern.
            asyncio.run(self.click_button())
            parse_data_thread.join()
            print('Ending')
        finally:
            self.page.quit()

    def parse_data(self):
        """Open the search page, listen for API responses and save the items.

        Loops until ``max_count`` items have been saved. Each captured
        response body is sliced down to a JSON document, and every entry of
        ``data['data']['itemsArray']`` is converted to a dict and passed to
        ``save_data``. Errors are printed and the loop continues.
        """
        page = self.page
        url = "https://s.taobao.com/search?xxxx=utf-8&xxxx=all&ie=utf8&ixxxx%89xxxx5&sxxxxhua%2Fa.201856.d13xxxxxx%E7%89%B9"
        # URL fragment identifying the responses the listener should capture.
        comment_url = "taobao.com/h5xxd.x.0/"
        page.get(url)
        page.listen.start(comment_url)
        while self.count < self.max_count:
            print('监听循环中,开始解析')
            res = page.listen.wait()
            res_str = res.response.body
            try:
                position = res_str.find('(')
                # ..... (code elided in the original post; it must define
                # start_len and end_len, which the slice below relies on)
                result = res_str[start_len:-end_len]  # print(result)
                data = json.loads(result)
                itemsArray = data['data']['itemsArray']
                print(f'array获取成功{itemsArray}')
                for item in itemsArray:
                    try:
                        # NOTE(review): the condition tests 'shopInfo' while
                        # the value read is 'title' -- confirm this is intended.
                        _title = item['title'] if item['shopInfo'] else ''
                        # ..... (code elided in the original post; it must
                        # assign `band`, which is used below)
                        price = item['priceShow']['price'] if item['priceShow']['price'] else ''
                        realSales = item['realSales'] if item['realSales'] else ''
                        procity = item['procity'] if item['procity'] else ''
                        item_save = {
                            "band": band,
                            # ...... (other fields elided in the original post)
                            "procity": procity,
                        }
                        self.save_data(item_save)
                    except Exception as exc:
                        # Best effort: skip the broken item but show the cause.
                        print('解析报错', exc)
            except Exception as exc:
                # Best effort: skip the broken response but show the cause.
                print('其他报错', exc)
if __name__ == '__main__':
    # Script entry point: build the crawler and run it until the target
    # number of items has been saved.
    tb = TB()
    tb.run()