Python Project Example: Scraping 1688 Product Listings with Proxy IPs

Field names for the product information to be collected:

id = 'Auto-increment ID'
task_id = 'Task ID'
platform = 'Platform'
search_key = 'Search keyword'
goods_id = 'Goods ID'
goods_title = 'Goods title'
goods_url = 'Goods URL'
goods_brand = 'Goods brand'
goods_now_price = 'Discounted price'
goods_old_price = 'Original price'
month_sale_num = '30-day sales volume'
discount_info = 'Coupon info'
place_of_delivery = 'Shipping origin'
shop_name = 'Shop name'
shopkeeper_nick = 'Shopkeeper nickname'
shop_id = 'Shop ID'
shop_level = 'Shop level'
shop_url = 'Shop URL'
delivery_score = 'Logistics score'
item_score = 'Description score'
score_p = 'Service score'
is_authorize = 'Authorized or not'
inv_count = 'Goods stock count'
fav_count = 'Goods favorites count'
fans_count = 'Goods followers count'
goods_pic = 'Goods main image'
sku_att_class = 'SKU attribute class'
sku_now_price = 'SKU price'
sku_old_price = 'SKU original price'
sku_inv_count = 'SKU stock count'
sku_url = 'SKU URL'
create_at = 'Created at'
update_at = 'Updated at'
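
The script imports csv, so these fields presumably become the columns of one output row per scraped SKU. A minimal sketch of writing the header row (the field order and file name here are assumptions, not from the original):

import csv

FIELDS = ['id', 'task_id', 'platform', 'search_key', 'goods_id', 'goods_title',
          'goods_url', 'goods_brand', 'goods_now_price', 'goods_old_price',
          'month_sale_num', 'discount_info', 'place_of_delivery', 'shop_name',
          'shopkeeper_nick', 'shop_id', 'shop_level', 'shop_url', 'delivery_score',
          'item_score', 'score_p', 'is_authorize', 'inv_count', 'fav_count',
          'fans_count', 'goods_pic', 'sku_att_class', 'sku_now_price',
          'sku_old_price', 'sku_inv_count', 'sku_url', 'create_at', 'update_at']

# Write the header once; scraped rows can then be appended via DictWriter.writerow().
with open('1688_goods.csv', 'w', newline='', encoding='utf-8-sig') as f:
    csv.DictWriter(f, fieldnames=FIELDS).writeheader()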

Code:

import requests
import urllib.parse
import re
import json
from lxml import etree
import time
import csv
import random


class Get1688Info(object):
	def __init__(self, task_id=None, search_key=None, id=None):
		self.headers = {
			'authority': 'search.1688.com',
			'origin': 'https://s.1688.com',
			'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
			'accept': '*/*',
			'sec-fetch-site': 'same-site',
			'sec-fetch-mode': 'cors',
			'referer': 'https://s.1688.com/selloffer/offer_search.htm',
			'accept-encoding': 'gzip, deflate, br',
			'accept-language': 'zh-CN,zh;q=0.9',
		}
		self.user_agents = [
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
			"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
			"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
			"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
		]
		self.uuid = id
		self.task_id = task_id
		self.search_key = search_key
		self.platform = '1688'


	def agent_proxies(self):    # Fetch a proxy IP from the provider
		ip_use = True
		i = 0
		proxies = None
		while ip_use:
			try:
				ag_url = 'URL of the proxy-IP provider API'    # elided in the original; fill in your provider's link
				ip_res = requests.get(ag_url).text.strip()
				resp_json = json.loads(ip_res)['data'][0]
				proxies = f'{resp_json["IP"]}'
				ip_use = self.test_ip(proxies)    # test_ip() returns False once a proxy works
				# Variant for providers that return a success flag plus separate ip/port fields:
				# success = json.loads(ip_res)['success']
				# if success == True:
				# 	resp_json = json.loads(ip_res)['data'][0]
				# 	proxies = f'{resp_json["ip"]}:{resp_json["port"]}'
				# 	ip_use = self.test_ip(proxies)
				# else:
				# 	print('Please add this machine to the provider whitelist')
				# 	jixu = input('Continue?')
			except Exception:
				if i < 5:
					i += 1
					time.sleep(2)
				else:
					print('Failed to fetch a proxy IP; the proxy site is unavailable')
					jixu = input('Continue?')
					i = 0    # reset the retry counter after the user confirms
		return proxies
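
	# For reference, the parsing above expects the provider to return JSON shaped
	# roughly like this made-up sample; field names vary by provider (the
	# commented-out variant expects lowercase 'ip'/'port' instead):
	#
	#     {"success": true, "data": [{"IP": "1.2.3.4:8080"}]}
	#
	# json.loads(...)['data'][0] then yields {'IP': '1.2.3.4:8080'}.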


	def test_ip(self, pro):    # Check whether a proxy IP is usable
		try:
			# requests takes the scheme of the *proxy* itself here; a plain HTTP
			# proxy is normally used for both http and https traffic
			proxies = {'http': f'http://{pro}', 'https': f'http://{pro}'}
			res = requests.get('https://www.baidu.com/', timeout=10, proxies=proxies)
			if res.status_code == 200:
				print('Proxy IP works')
				return False    # False stops the retry loop in agent_proxies()
			else:
				print('Proxy IP is invalid')
				time.sleep(2)
				return True
		except Exception:
			print('Proxy IP is invalid')
			time.sleep(2)
			return True    # the original returned an unbound ip_use here; keep retrying


	# Get the total number of product pages
	def num_page(self):
		# The endpoint is elided in this excerpt; fill in the real 1688 search API URL
		url = 'URL of the 1688 search API'
		params = {
			'n': 'y',
			'netType': '1,11,16',
			'beginPage': '1',
			'async': 'true',
			'asyncCount': '20',
			'pageSize': '60',
			'startIndex': '0',
			'offset': '9',
		}
		response = requests.get(url, headers=self.headers, params=params, timeout=10)
		goodsinformation_json = response.json()
		num_page = goodsinformation_json['data']['data']['pageCount']
		print(f'The product list has {num_page} pages in total')
		self.get_goods_ids(num_page)
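
	# Note: the params above carry no search keyword; the truncated part of the
	# post presumably adds it (urllib.parse is imported, which points at keyword
	# encoding). A hypothetical helper, where the endpoint, the 'keywords'
	# parameter name, and the GBK charset are all assumptions:
	def build_search_url(self, params):
		base = 'https://search.1688.com/selloffer/offer_search.htm'    # assumed endpoint
		query = dict(params, keywords=self.search_key)
		# build the query string by hand so requests does not re-encode the GBK bytes
		return f'{base}?{urllib.parse.urlencode(query, encoding="gbk")}'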

	# Collect the goods IDs on every page
	def get_goods_ids(self, num_page):
		goods_id_lists = []
		x = 0
		agent_proxies = self.agent_proxies()
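		# The post is cut off at this point; the loop below is a hedged
		# reconstruction, not the author's code. The JSON path 'offerList', the
		# 'id' field, and the user-agent rotation are guesses.
		for page in range(1, num_page + 1):
			self.headers['user-agent'] = random.choice(self.user_agents)    # rotate UA (assumed use of self.user_agents)
			proxy_map = {'http': f'http://{agent_proxies}', 'https': f'http://{agent_proxies}'}
			params = {'beginPage': str(page), 'pageSize': '60', 'async': 'true'}
			url = 'URL of the 1688 search API'    # same elided endpoint as in num_page
			response = requests.get(url, headers=self.headers, params=params,
									proxies=proxy_map, timeout=10)
			for offer in response.json()['data']['data']['offerList']:
				goods_id_lists.append(offer['id'])
			time.sleep(random.uniform(1, 3))    # throttle between pages
		return goods_id_lists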