# 一、免费IP采集(几乎没有有效IP)  (Part 1: free proxy collection — almost no IPs are still valid)
import requests
import time
from lxml import etree
import pymongo
import random
class KuaiDaiLi:
    """Scrape free proxies from kuaidaili.com, verify each one against httpbin,
    and store the working proxies in MongoDB (database ``python``, collection ``IP``)."""

    def __init__(self):
        # Keep the client handle so main() can close it; documents go into python.IP.
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.IP
        self.url = 'https://www.kuaidaili.com/free/inha/{}/'
        # httpbin.org/ip echoes the caller's IP, so a 200 through the proxy proves it works.
        self.test_url = 'http://httpbin.org/ip'
        self.headers = {
            'User-Agent': 'xxx'
        }

    def get_data(self):
        """Fetch listing pages 155-255 and hand each page's HTML to the parser."""
        for i in range(155, 256):
            response = requests.get(self.url.format(i), headers=self.headers)
            html = response.text
            self.parse_data(html)
            # Random delay (~1.25s or ~1.67s) between requests to reduce blocking risk.
            time.sleep(5 / random.randint(3, 4))

    def parse_data(self, html):
        """Extract every proxy row from a listing page and verify each proxy."""
        element = etree.HTML(html)
        # position()>1 skips the table header row.
        trs = element.xpath('//*[@id="list"]/table/tbody/tr[position()>1]')
        for tr in trs:
            item = {}
            ip = tr.xpath('./td[@data-title="IP"]/text()')[0]
            port = tr.xpath('./td[@data-title="PORT"]/text()')[0]
            item['anonymous'] = tr.xpath('./td[@data-title="匿名度"]/text()')[0]
            item['style'] = tr.xpath('./td[@data-title="类型"]/text()')[0]
            # NOTE: key kept as 'localtion' (sic) so documents stay consistent with
            # anything already stored under that misspelled field name.
            item['localtion'] = tr.xpath('./td[@data-title="位置"]/text()')[0]
            item['speed'] = tr.xpath('./td[@data-title="响应速度"]/text()')[0]
            item['time'] = tr.xpath('./td[@data-title="最后验证时间"]/text()')[0]
            item['pay'] = tr.xpath('./td[@data-title="付费方式"]/text()')[0]
            self.ip_verificate(ip, port, item)

    def ip_verificate(self, ip, port, item):
        """Try the proxy against the test URL; persist it only on a 200 response."""
        proxies = {'http': 'http://' + ip + ':' + port}
        try:
            response = requests.get(self.test_url, headers=self.headers, proxies=proxies, timeout=2)
            if response.status_code == 200:
                item['ip'] = ip
                item['port'] = port
                print(response.text, item)
                self.save_data(item)
            else:
                print(f'{ip}:{port},状态码:', response.status_code)
        except Exception:
            # Free proxies are mostly dead; the usual failure is a connect timeout.
            print('ConnectTimeoutError', ip)

    def save_data(self, item):
        """Insert one verified proxy document into MongoDB.

        BUG FIX: the original called self.db.insert_one(item), but self.db is the
        MongoClient (which has no insert_one) — the insert must target the collection.
        """
        self.collection.insert_one(item)
        print('插入成功')

    def main(self):
        """Run the full scrape-verify-store pipeline, then close the Mongo client."""
        self.get_data()
        self.db.close()
if __name__ == '__main__':
    # Entry point: build the crawler and run the whole pipeline.
    spider = KuaiDaiLi()
    spider.main()
# 二、付费IP应用  (Part 2: using paid proxy IPs)
import requests
import pymysql
import threading
from lxml import etree
from queue import Queue
import time
from pprint import pprint
import re
from retrying import retry
from feapder.network.user_agent import get
from loguru import logger
class Amazon:
    """Multi-threaded Amazon.cn crawler routed through paid proxies.

    Pipeline (each stage is a queue-connected thread pool):
    get_ip -> test_ip (shared) -> get_secondary_classification ->
    get_goods_info -> get_good_data -> save_data (MySQL).
    """

    def __init__(self):
        self.base_url = 'https://www.amazon.cn/nav/ajax/hamburgerMainContent?ajaxTemplate=hamburgerMainContent&pageType=Gateway&hmDataAjaxHint=1&navDeviceType=desktop&isSmile=0&isPrime=0&isBackup=false&hashCustomerAndSessionId=c108bde04b677f19f2e5d7df74ff6ce0cad515fc&languageCode=zh_CN&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6122949553-AL2_x86_64&secondLayerTreeName=apparel_shoes%2Bcomputer_office%2Bhome_kitchen%2Bbeauty_pca%2Bkindle_ebook%2Bsports_outdoor%2Bgrocery%2Bbaby_toy%2Bphones_elec%2Bjewelry_watch%2Bhome_improvement%2Bvideo_game%2Bmusical_instrument%2Bcamera&customerCountryCode=null'
        self.headers = {
            "Connection": "keep-alive",
            "downlink": "10",
            "ect": "4g",
            "rtt": "50",
            "User-Agent": "xxx",
        }
        self.db = pymysql.connect(user='root', password='12345', host='localhost', database='python', port=3306, charset='utf8')
        self.cursor = self.db.cursor()
        # Placeholder for the paid-proxy vendor API endpoint.
        self.ip_url = '获取付费ip接口'
        self.ip_queue = Queue()
        self.classification_info_queue = Queue()
        self.good_info_queue = Queue()
        self.save_queue = Queue()

    def get_ip(self):
        """Keep exactly one fresh proxy in ip_queue, refilling it when consumed."""
        while True:
            if self.ip_queue.empty():
                response = requests.get(self.ip_url)
                print(response.text)
                self.ip_queue.put(response.text)
            else:
                # Yield briefly instead of busy-spinning a full CPU core
                # (the original `continue` looped as fast as possible).
                time.sleep(0.1)

    @retry(stop_max_attempt_number=3)
    def test_ip(self, url):
        """Fetch *url* through a proxy from the queue; retry up to 3 times.

        The proxy is put back only after a successful 200 response, so dead
        IPs are dropped and get_ip() replaces them.
        """
        ip = self.ip_queue.get()
        # BUG FIX: was 'http:' + ip, which builds a malformed proxy URL
        # ("http:1.2.3.4:port"); the scheme needs the '//' separator.
        proxies = {'http': 'http://' + ip}
        self.headers['User-Agent'] = get()
        response = requests.get(url, headers=self.headers, proxies=proxies, timeout=2)
        assert response.status_code == 200, '状态码错误'
        self.ip_queue.put(ip)
        return response

    def get_secondary_classification(self):
        """Parse the hamburger-menu AJAX payload into (name, node-id) pairs."""
        response = self.test_ip(self.base_url)
        # The JSON 'data' field carries an HTML fragment of the menu.
        html = response.json()['data']
        element = etree.HTML(html)
        li_list = element.xpath('//ul/li[position() > 2]')
        for li in li_list:
            classification_info = {}
            if li.xpath('./a/text()'):
                # Skip "see all" entries and absolute external links.
                if '全部' in li.xpath('./a/text()')[0]:
                    continue
                if 'http' in li.xpath('./a/@href')[0]:
                    continue
                classification_info['classification_name'] = li.xpath('./a/text()')[0]
                classification_href = li.xpath('./a/@href')[0]
                classification_info['classification_keyid'] = re.findall(r'.*?node=(.*?)&ref_=.*?', classification_href)[0]
                self.classification_info_queue.put(classification_info)

    def get_goods_info(self):
        """Consume categories, crawl the first 10 result pages of each, and queue product URLs."""
        while True:
            classification_info = self.classification_info_queue.get()
            start_url = f"https://www.amazon.cn/s?rh=n%3A{classification_info['classification_keyid']}&fs=true"
            # Pagination depth is fixed at 10 pages; reading the real page count
            # from the pagination strip proved unreliable and was dropped.
            for i in range(1, 11):
                goods_url = start_url + f"&page={i}"
                try:
                    response = self.test_ip(goods_url)
                except Exception:
                    logger.error('商品列表页', goods_url)
                    continue
                element = etree.HTML(response.text)
                divs = element.xpath('//span/div[@class="s-main-slot s-result-list s-search-results sg-row"]/div[@data-component-type="s-search-result"]')
                for div in divs:
                    good_info = {}
                    good_info['classification_name'] = classification_info['classification_name']
                    good_info['goods_url'] = goods_url
                    h = div.xpath('.//div/h2/a[@class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]/@href')
                    if h:
                        good_info['good_url'] = 'https://www.amazon.cn' + h[0]
                        self.good_info_queue.put(good_info)
                    else:
                        continue
            self.classification_info_queue.task_done()

    def get_good_data(self):
        """Consume product URLs, scrape title/price from the detail page, queue rows for saving."""
        while True:
            good_info = self.good_info_queue.get()
            try:
                response = self.test_ip(good_info['good_url'])
            except Exception:
                logger.error('商品详情页', good_info['good_url'])
                continue
            element = etree.HTML(response.text)
            # Hoist each XPath so it runs once instead of twice per page.
            title_nodes = element.xpath('//div[@id="centerCol"]//h1/span/text()')
            good_title = title_nodes[0] if title_nodes else element.xpath('//title/text()')[0]
            price_nodes = element.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span/span[@class="a-offscreen"]/text()')
            good_price = price_nodes[0] if price_nodes \
                else '-'.join(element.xpath('//td[@class="a-span12"]//span[@class="a-offscreen"]/text()'))
            good_tup = (good_info['classification_name'], good_info['goods_url'], good_info['good_url'], good_title, good_price)
            print(good_tup)
            self.save_queue.put(good_tup)
            self.good_info_queue.task_done()

    def create_table(self):
        """Create the amazon table if it does not exist yet."""
        sql = """
        create table if not exists amazon(
        id int unsigned primary key auto_increment,
        classification_name varchar(50),
        goods_url varchar(400),
        good_url varchar(400),
        good_title varchar(400),
        good_price varchar(30)
        )
        """
        try:
            self.cursor.execute(sql)
            print('表创建成功')
        except Exception as e:
            print('表创建失败', repr(e))

    def save_data(self):
        """Drain the save queue in batches of up to 30 rows and commit them to MySQL."""
        while True:
            data_list = []
            for i in range(30):
                try:
                    data = self.save_queue.get(timeout=3)
                    # id 0 lets MySQL auto_increment assign the real key.
                    data_list.append((0,) + data)
                    self.save_queue.task_done()
                except Exception as e:
                    print('队列消息数量不足30', repr(e))
                    break
            # Skip the round-trip (and the misleading success message)
            # when the batch is empty.
            if not data_list:
                continue
            sql = """
            insert into amazon(id, classification_name, goods_url, good_url, good_title, good_price)
            values(%s, %s, %s, %s, %s, %s)
            """
            try:
                self.cursor.executemany(sql, data_list)
                self.db.commit()
                print('保存成功')
            except Exception:
                self.db.rollback()
                print('保存失败')

    def main(self):
        """Wire up all worker threads, wait for every queue to drain, then close MySQL."""
        self.create_table()
        threads = []
        t_ip = threading.Thread(target=self.get_ip)
        threads.append(t_ip)
        t_classification = threading.Thread(target=self.get_secondary_classification)
        threads.append(t_classification)
        for i in range(6):
            t_goods = threading.Thread(target=self.get_goods_info)
            threads.append(t_goods)
        for i in range(6):
            t_good = threading.Thread(target=self.get_good_data)
            threads.append(t_good)
        t_save = threading.Thread(target=self.save_data)
        threads.append(t_save)
        for t in threads:
            # setDaemon() is deprecated; assign the attribute instead.
            t.daemon = True
            t.start()
        time.sleep(2)
        # BUG FIX: the original list held good_info_queue twice and never
        # joined classification_info_queue, so the pipeline could exit early.
        for q in [self.classification_info_queue, self.good_info_queue, self.save_queue]:
            q.join()
        self.db.close()
if __name__ == '__main__':
    # Rotate the log file once it grows past 500 MB.
    logger.add("runtime_{time}.log", rotation="500 MB")
    crawler = Amazon()
    crawler.main()
# 不使用代理  (Part 3: variant that runs without proxies)
import requests
import pymysql
import threading
from lxml import etree
from queue import Queue
import random
import time
from pprint import pprint
import re
from feapder.network.user_agent import get
class Amazon:
    """Multi-threaded Amazon.cn crawler — proxy-free variant.

    Same queue pipeline as the proxied version, but requests go out directly
    with a rotating User-Agent and a small random delay per detail page.
    """

    def __init__(self):
        self.base_url = 'https://www.amazon.cn/nav/ajax/hamburgerMainContent?ajaxTemplate=hamburgerMainContent&pageType=Gateway&hmDataAjaxHint=1&navDeviceType=desktop&isSmile=0&isPrime=0&isBackup=false&hashCustomerAndSessionId=c108bde04b677f19f2e5d7df74ff6ce0cad515fc&languageCode=zh_CN&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6122949553-AL2_x86_64&secondLayerTreeName=apparel_shoes%2Bcomputer_office%2Bhome_kitchen%2Bbeauty_pca%2Bkindle_ebook%2Bsports_outdoor%2Bgrocery%2Bbaby_toy%2Bphones_elec%2Bjewelry_watch%2Bhome_improvement%2Bvideo_game%2Bmusical_instrument%2Bcamera&customerCountryCode=null'
        self.headers = {
            "Connection": "keep-alive",
            "downlink": "10",
            "ect": "4g",
            "rtt": "50",
            "User-Agent": "xxx",
        }
        self.db = pymysql.connect(user='root', password='12345', host='localhost', database='python', port=3306, charset='utf8')
        self.cursor = self.db.cursor()
        # Unused in this proxy-free variant; kept for interface parity.
        self.ip_url = ''
        self.classification_info_queue = Queue()
        self.good_info_queue = Queue()
        self.save_queue = Queue()

    def get_secondary_classification(self):
        """Parse the hamburger-menu AJAX payload into (name, node-id) pairs."""
        self.headers['User-Agent'] = get()
        response = requests.get(self.base_url, headers=self.headers)
        # The JSON 'data' field carries an HTML fragment of the menu.
        html = response.json()['data']
        element = etree.HTML(html)
        li_list = element.xpath('//ul/li[position() > 2]')
        for li in li_list:
            classification_info = {}
            if li.xpath('./a/text()'):
                # Skip "see all" entries and absolute external links.
                if '全部' in li.xpath('./a/text()')[0]:
                    continue
                if 'http' in li.xpath('./a/@href')[0]:
                    continue
                classification_info['classification_name'] = li.xpath('./a/text()')[0]
                classification_href = li.xpath('./a/@href')[0]
                classification_info['classification_keyid'] = re.findall(r'.*?node=(.*?)&ref_=.*?', classification_href)[0]
                self.classification_info_queue.put(classification_info)

    def get_goods_info(self):
        """Consume categories, crawl the first result page of each, and queue product URLs."""
        while True:
            classification_info = self.classification_info_queue.get()
            start_url = f"https://www.amazon.cn/s?rh=n%3A{classification_info['classification_keyid']}&fs=true"
            # Only page 1 is crawled here to keep the proxy-free load low.
            for i in range(1, 2):
                self.headers['User-Agent'] = get()
                goods_url = start_url + f"&page={i}"
                response = requests.get(goods_url, headers=self.headers)
                element = etree.HTML(response.text)
                divs = element.xpath('//span/div[@class="s-main-slot s-result-list s-search-results sg-row"]/div[@data-component-type="s-search-result"]')
                for div in divs:
                    good_info = {}
                    good_info['classification_name'] = classification_info['classification_name']
                    good_info['goods_url'] = goods_url
                    h = div.xpath('.//div/h2/a[@class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]/@href')
                    if h:
                        good_info['good_url'] = 'https://www.amazon.cn' + h[0]
                        self.good_info_queue.put(good_info)
                    else:
                        continue
            self.classification_info_queue.task_done()

    def get_good_data(self):
        """Consume product URLs, scrape title/price from the detail page, queue rows for saving."""
        while True:
            # Throttle: 1-2s between detail-page hits since there is no proxy pool.
            time.sleep(random.uniform(1, 2))
            self.headers['User-Agent'] = get()
            good_info = self.good_info_queue.get()
            response = requests.get(good_info['good_url'], headers=self.headers)
            element = etree.HTML(response.text)
            # Hoist each XPath so it runs once instead of twice per page.
            title_nodes = element.xpath('//div[@id="centerCol"]//h1/span/text()')
            good_title = title_nodes[0] if title_nodes else element.xpath('//title/text()')[0]
            price_nodes = element.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span/span[@class="a-offscreen"]/text()')
            good_price = price_nodes[0] if price_nodes \
                else '-'.join(element.xpath('//td[@class="a-span12"]//span[@class="a-offscreen"]/text()'))
            good_tup = (good_info['classification_name'], good_info['goods_url'], good_info['good_url'], good_title, good_price)
            print('@'*50, good_info['good_url'], good_title, good_price)
            self.save_queue.put(good_tup)
            self.good_info_queue.task_done()

    def create_table(self):
        """Create the amazon table if it does not exist yet."""
        sql = """
        create table if not exists amazon(
        id int unsigned primary key auto_increment,
        classification_name varchar(50),
        goods_url varchar(400),
        good_url varchar(400),
        good_title varchar(400),
        good_price varchar(30)
        )
        """
        try:
            self.cursor.execute(sql)
            print('表创建成功')
        except Exception as e:
            print('表创建失败', repr(e))

    def save_data(self):
        """Drain the save queue in batches of up to 30 rows and commit them to MySQL."""
        while True:
            data_list = []
            for i in range(30):
                try:
                    data = self.save_queue.get(timeout=3)
                    # id 0 lets MySQL auto_increment assign the real key.
                    data_list.append((0,) + data)
                    self.save_queue.task_done()
                except Exception:
                    # FIX: was a bare `except:`; narrow it so KeyboardInterrupt
                    # and SystemExit are not swallowed. Empty() still breaks the batch.
                    break
            # Skip the round-trip (and the misleading success message)
            # when the batch is empty.
            if not data_list:
                continue
            sql = """
            insert into amazon(id, classification_name, goods_url, good_url, good_title, good_price)
            values(%s, %s, %s, %s, %s, %s)
            """
            try:
                self.cursor.executemany(sql, data_list)
                self.db.commit()
                print('保存成功')
            except Exception:
                self.db.rollback()
                print('保存失败')

    def main(self):
        """Wire up all worker threads, wait for every queue to drain, then close MySQL."""
        self.create_table()
        threads = []
        t_classification = threading.Thread(target=self.get_secondary_classification)
        threads.append(t_classification)
        for i in range(6):
            t_goods = threading.Thread(target=self.get_goods_info)
            threads.append(t_goods)
        for i in range(6):
            t_good = threading.Thread(target=self.get_good_data)
            threads.append(t_good)
        t_save = threading.Thread(target=self.save_data)
        threads.append(t_save)
        for t in threads:
            # setDaemon() is deprecated; assign the attribute instead.
            t.daemon = True
            t.start()
        time.sleep(2)
        # BUG FIX: the original list held good_info_queue twice and never
        # joined classification_info_queue, so the pipeline could exit early.
        for q in [self.classification_info_queue, self.good_info_queue, self.save_queue]:
            q.join()
        self.db.close()
if __name__ == '__main__':
    # Entry point for the proxy-free crawler.
    crawler = Amazon()
    crawler.main()