原帖见【爬虫】Python使用动态IP,多线程,爬取uncomtrade的数据_学金融的程序员懒羊羊的博客-CSDN博客_爬虫动态ip
原帖的 URL 以及自定义函数的代码存在一些小问题;本帖修正了 URL 的拼接代码和自定义函数中的递归问题,代码已可以跑通。
需要自己去ip代理平台购买IP代理。
# coding=gbk
import requests
import pandas as pd
from random import randint
import datetime
from io import StringIO
import threading
import os
class proxy:
    """Settings of an authenticated HTTP proxy plus a pool of User-Agent strings.

    Instances expose:
        proxyHost, proxyPort, proxyUser, proxyPass: the values given to the
            constructor.
        user_agents: list of User-Agent strings to pick from.
        proxies: dict with "http" and "https" keys, both mapped to the same
            ``http://user:pass@host:port`` URL.
    """

    # Class-level placeholders; the real values are set per instance in __init__.
    proxyHost = None
    proxyPort = None
    proxyUser = None
    proxyPass = None
    # Default User-Agent pool; replaced on the instance by the list given to __init__.
    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.0 Safari/537.36",
    ]
    # Class-level default; every instance gets its own dict in __init__.
    proxies = {}

    def __init__(self, proxyHost, proxyPort, proxyUser, proxyPass, user_agents):
        """Store the proxy settings and build the ``proxies`` mapping.

        Args:
            proxyHost: host name or IP address of the proxy server.
            proxyPort: port of the proxy server.
            proxyUser: user name used to authenticate against the proxy.
            proxyPass: password used to authenticate against the proxy.
            user_agents: list of User-Agent strings for this instance.
        """
        self.proxyHost = proxyHost
        self.proxyPort = proxyPort
        self.proxyUser = proxyUser
        self.proxyPass = proxyPass
        self.user_agents = user_agents
        # The same authenticated proxy URL serves both plain and TLS traffic.
        proxy_url = "http://{}:{}@{}:{}".format(proxyUser, proxyPass, proxyHost, proxyPort)
        self.proxies = {
            "http": proxy_url,
            "https": proxy_url,
        }
# Failures are appended to CSV logs on disk; the downloaded data is returned in memory.
def _log_failure(file_name, *fields):
    """Append one timestamped, comma-separated row to ./uncomtrade_data/<file_name>."""
    log_dir = "./uncomtrade_data"
    # The directory may not exist yet when the very first request fails.
    os.makedirs(log_dir, exist_ok=True)
    row = ",".join([str(datetime.datetime.now())] + [str(field) for field in fields])
    with open(log_dir + "/" + file_name, 'a', encoding="utf-8") as log:
        log.write(row + "\n")


def _is_server_error(text):
    """Return True when the response body is one of the known error pages."""
    return (
        text == "<html><body><h1>502 Bad Gateway</h1>\nThe server returned an invalid or incomplete response.\n</body></html>\n"
        or text == "Too Many Requests.\n"
        or text == "{\"Message\":\"An error has occurred.\"}"
        or ' <h2>403 - Forbidden: Access is denied.</h2>' in text
        or "Proxy Bad Server" in text
    )


def download_url(url, ifuse_proxy=False, proxy=None, max_retries=None):
    """Download ``url`` and parse the response body as CSV.

    The request is repeated whenever it raises ``requests.RequestException``
    or returns one of the known error pages. Retries run in a loop rather
    than by recursion, so a long outage cannot hit the recursion limit.

    Args:
        url: address to fetch.
        ifuse_proxy: when true, ``proxy`` must provide ``user_agents`` and
            ``proxies`` attributes; a random User-Agent and a random
            "Proxy-Tunnel" header value are drawn for every attempt.
        proxy: proxy settings object when ``ifuse_proxy`` is true; otherwise
            it is passed unchanged as the ``proxies`` argument of
            ``requests.get`` (``None`` means no proxy).
        max_retries: maximum number of retries after the first attempt.
            ``None`` (the default) retries without limit.

    Returns:
        A ``pandas.DataFrame`` parsed from the response, or ``None`` when the
        server reports that no data matches the query or when
        ``max_retries`` is exhausted.
    """
    no_data_marker = 'No data matches your query or your query is too complex. Request JSON or XML format for more information.'
    retries_done = 0
    while True:
        request_kwargs = {"timeout": 100}
        if ifuse_proxy:
            # Pick a fresh User-Agent and tunnel id for every attempt.
            random_agent = proxy.user_agents[randint(0, len(proxy.user_agents) - 1)]
            tunnel = randint(1, 10000)
            request_kwargs["headers"] = {
                "Proxy-Tunnel": str(tunnel),
                "User-Agent": random_agent,
            }
            request_kwargs["proxies"] = proxy.proxies
        else:
            request_kwargs["proxies"] = proxy
        try:
            text = requests.get(url, **request_kwargs).text
        except requests.RequestException as e:
            print(type(e).__name__ + " has occurred, change proxy!")
            _log_failure("exp.csv", type(e).__name__, url)
        else:
            # Some failures arrive as a normal response whose body is an error page.
            if _is_server_error(text):
                _log_failure("serverError.csv", url)
                print(str(url) + ' 403 forbidden, retrying')
            elif no_data_marker in text:
                print(str(url) + ' no data')
                return None
            else:
                return pd.read_csv(StringIO(text), on_bad_lines='skip')
        if max_retries is not None and retries_done >= max_retries:
            return None
        retries_done += 1
def get_data_un_comtrade(max_un=100000, r='156', freq='A', ps='2021', px='S1', p='all', rg='1', cc='TOTAL', fmt='csv',
                         type_un='C', ifuse_proxy=False, proxy=None):
    """Build a UN Comtrade query URL, download it and return the named result.

    Args:
        max_un: maximum number of records to return (default 100000).
        r: reporting area, i.e. the country whose data is requested.
        freq: data frequency, annual or monthly ('A' or 'M').
        ps: period (year) to request.
        px: classification standard, e.g. 'S3' for SITC Revision 3.
        p: partner area; e.g. for China's exports to Russia the reporter is
            China and the partner is Russia.
        rg: trade flow, '1' for imports and '2' for exports.
        cc: commodity code.
        fmt: output format, 'csv' or 'json' (default 'csv').
        type_un: trade type, commodities or services.
        ifuse_proxy: whether to send the request through a proxy.
        proxy: proxy settings forwarded to ``download_url``.

    Returns:
        A one-item dict ``{data_name: data}`` where ``data_name`` is
        ``ps_r_p_px_cc_<IMPORT|EXPORT>_freq`` and ``data`` is whatever
        ``download_url`` returned.
    """
    pre_url = "http://comtrade.un.org/api/get?max={}&type={}&freq={}&px={}&ps={}&r={}&p={}&rg={}&cc={}&fmt={}"
    url_use = pre_url.format(max_un, type_un, freq, px, ps, r, p, rg, cc, fmt)
    print("Getting data from:" + url_use)
    data = download_url(url_use, ifuse_proxy=ifuse_proxy, proxy=proxy)
    # rg is normally the string '1'/'2'; comparing as text also accepts ints.
    ex_or_in = 'IMPORT' if str(rg) == '1' else 'EXPORT'
    # str() lets numeric arguments such as ps=2021 take part in the name.
    data_name = "_".join(str(part) for part in (ps, r, p, px, cc, ex_or_in, freq))
    return {data_name: data}
# Lookup table from area code (string) to area name.
# In this file the keys are passed as the reporter code `r` of
# get_data_un_comtrade(), and the values are used in progress messages and as
# the base name of the saved .xlsx files.
# NOTE(review): the name for '531' is 'Curaao', which looks like 'Curaçao' with
# the 'ç' lost in an encoding round-trip — confirm before relying on it.
countries = {'156': 'China', '344': 'China, Hong Kong SAR', '446': 'China, Macao SAR',
'4': 'Afghanistan', '8': 'Albania', '12': 'Algeria', '20': 'Andorra', '24': 'Angola', '660': 'Anguilla',
'28': 'Antigua and Barbuda', '32': 'Argentina', '51': 'Armenia',
'533': 'Aruba', '36': 'Australia', '40': 'Austria', '31': 'Azerbaijan', '44': 'Bahamas', '48': 'Bahrain',
'50': 'Bangladesh', '52': 'Barbados',
'112': 'Belarus', '56': 'Belgium', '58': 'Belgium-Luxembourg', '84': 'Belize', '204': 'Benin',
'60': 'Bermuda', '64': 'Bhutan', '68': 'Bolivia (Plurinational State of)', '535': 'Bonaire',
'70': 'Bosnia Herzegovina', '72': 'Botswana', '92': 'Br. Virgin Isds', '76': 'Brazil',
'96': 'Brunei Darussalam', '100': 'Bulgaria', '854': 'Burkina Faso', '108': 'Burundi', '132': 'Cabo Verde',
'116': 'Cambodia',
'120': 'Cameroon', '124': 'Canada', '136': 'Cayman Isds', '140': 'Central African Rep.', '148': 'Chad',
'152': 'Chile',
'170': 'Colombia', '174': 'Comoros', '178': 'Congo', '184': 'Cook Isds', '188': 'Costa Rica',
'384': "Cote d'Ivoire", '191': 'Croatia', '192': 'Cuba', '531': 'Curaao', '196': 'Cyprus',
'203': 'Czechia',
'200': 'Czechoslovakia', '408': "Dem. People's Rep. of Korea", '180': 'Dem. Rep. of the Congo',
'208': 'Denmark', '262': 'Djibouti', '212': 'Dominica', '214': 'Dominican Rep.', '218': 'Ecuador',
'818': 'Egypt', '222': 'El Salvador', '226': 'Equatorial Guinea', '232': 'Eritrea', '233': 'Estonia',
'231': 'Ethiopia', '234': 'Faeroe Isds', '238': 'Falkland Isds (Malvinas)', '242': 'Fiji',
'246': 'Finland',
'251': 'France', '254': 'French Guiana', '258': 'French Polynesia', '583': 'FS Micronesia', '266': 'Gabon',
'270': 'Gambia', '268': 'Georgia', '276': 'Germany', '288': 'Ghana', '292': 'Gibraltar',
'300': 'Greece', '304': 'Greenland', '308': 'Grenada', '312': 'Guadeloupe', '320': 'Guatemala',
'324': 'Guinea', '624': 'Guinea-Bissau', '328': 'Guyana', '332': 'Haiti',
'336': 'Holy See (Vatican City State)',
'340': 'Honduras', '348': 'Hungary', '352': 'Iceland', '699': 'India', '364': 'Iran', '368': 'Iraq',
'372': 'Ireland', '376': 'Israel', '381': 'Italy', '388': 'Jamaica', '392': 'Japan',
'400': 'Jordan', '398': 'Kazakhstan', '404': 'Kenya', '296': 'Kiribati', '414': 'Kuwait',
'417': 'Kyrgyzstan', '418': "Lao People's Dem. Rep.", '428': 'Latvia', '422': 'Lebanon', '426': 'Lesotho',
'430': 'Liberia', '434': 'Libya', '440': 'Lithuania', '442': 'Luxembourg', '450': 'Madagascar',
'454': 'Malawi', '458': 'Malaysia', '462': 'Maldives', '466': 'Mali', '470': 'Malta',
'584': 'Marshall Isds',
'474': 'Martinique', '478': 'Mauritania', '480': 'Mauritius', '175': 'Mayotte', '484': 'Mexico',
'496': 'Mongolia', '499': 'Montenegro', '500': 'Montserrat', '504': 'Morocco', '508': 'Mozambique',
'104': 'Myanmar',
'580': 'N. Mariana Isds', '516': 'Namibia', '524': 'Nepal', '530': 'Neth. Antilles',
'532': 'Neth. Antilles and Aruba', '528': 'Netherlands', '540': 'New Caledonia', '554': 'New Zealand',
'558': 'Nicaragua',
'562': 'Niger', '566': 'Nigeria', '579': 'Norway', '512': 'Oman', '586': 'Pakistan', '585': 'Palau',
'591': 'Panama', '598': 'Papua New Guinea', '600': 'Paraguay', '459': 'Peninsula Malaysia', '604': 'Peru',
'608': 'Philippines',
'616': 'Poland', '620': 'Portugal', '634': 'Qatar', '410': 'Rep. of Korea', '498': 'Rep. of Moldova',
'638': 'Réunion', '642': 'Romania', '643': 'Russian Federation', '646': 'Rwanda', '647': 'Ryukyu Isd',
'461': 'Sabah',
'652': 'Saint Barthelemy', '654': 'Saint Helena', '659': 'Saint Kitts and Nevis', '662': 'Saint Lucia',
'534': 'Saint Maarten', '666': 'Saint Pierre and Miquelon', '670': 'Saint Vincent and the Grenadines',
'882': 'Samoa', '674': 'San Marino', '678': 'Sao Tome and Principe', '457': 'Sarawak',
'682': 'Saudi Arabia', '686': 'Senegal', '688': 'Serbia', '690': 'Seychelles', '694': 'Sierra Leone',
'702': 'Singapore',
'703': 'Slovakia', '705': 'Slovenia', '90': 'Solomon Isds', '706': 'Somalia', '710': 'South Africa',
'728': 'South Sudan', '724': 'Spain', '144': 'Sri Lanka', '275': 'State of Palestine',
'729': 'Sudan', '740': 'Suriname', '748': 'Eswatini', '752': 'Sweden', '757': 'Switzerland',
'760': 'Syria', '762': 'Tajikistan', '807': 'North Macedonia', '764': 'Thailand', '626': 'Timor-Leste',
'768': 'Togo', '772': 'Tokelau', '776': 'Tonga', '780': 'Trinidad and Tobago', '788': 'Tunisia',
'795': 'Turkmenistan', '796': 'Turks and Caicos Isds', '798': 'Tuvalu', '800': 'Uganda',
'804': 'Ukraine', '784': 'United Arab Emirates', '826': 'United Kingdom', '834': 'United Rep. of Tanzania',
'858': 'Uruguay', '850': 'US Virgin Isds', '842': 'USA', '860': 'Uzbekistan',
'548': 'Vanuatu', '862': 'Venezuela', '704': 'Viet Nam', '876': 'Wallis and Futuna Isds', '887': 'Yemen',
'894': 'Zambia', '716': 'Zimbabwe'}
# Example of fetching data with get_data_un_comtrade().
# proxyHost, proxyPort, proxyUser, proxyPass and user_agents must be set for
# the proxy service you use. The four connection values are read from
# environment variables so that credentials need not be written into the
# script; each falls back to an empty string when its variable is unset.
proxyHost = os.environ.get("PROXY_HOST", "")
proxyPort = os.environ.get("PROXY_PORT", "")
proxyUser = os.environ.get("PROXY_USER", "")
proxyPass = os.environ.get("PROXY_PASS", "")
user_agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# Create the proxy instance shared by all download threads.
proxy_use = proxy(proxyHost, proxyPort, proxyUser, proxyPass, user_agents)
class DownloadData(threading.Thread):
    """Thread that downloads one (country, year, commodity) import dataset.

    The result is written to
    ``./uncomtrade_data/<good>/<year>/<country name>.xlsx`` when the download
    returns a non-empty DataFrame.
    """

    all_data = None

    def __init__(self, country_code, year, good):
        """Remember what to download.

        Args:
            country_code: key of the module-level ``countries`` table, used
                as the reporter code.
            year: period to request; stored as a string.
            good: commodity code to request.
        """
        super().__init__()
        self.country_code = country_code
        self.year = str(year)
        self.good = good

    def run(self):
        """Download the import data and save it as an Excel file."""
        # The module-level semaphore caps how many downloads run at once.
        with semaphore:
            country_name = countries[self.country_code]
            print(country_name + " BEGINS! TIME:", datetime.datetime.now())
            temp_import = get_data_un_comtrade(max_un=100000, r=self.country_code, freq='A', ps=self.year, px='S1',
                                               p='all', rg='1', cc=self.good, fmt='csv', type_un='C',
                                               ifuse_proxy=True, proxy=proxy_use)
            temp_name_import = list(temp_import.keys())[0]
            # Create the <good>/<year> output folder. exist_ok avoids the race
            # where two threads both see the folder missing and both create it.
            jutidizhi = "./uncomtrade_data/{}/{}/".format(self.good, self.year)
            os.makedirs(jutidizhi, exist_ok=True)
            # Store the downloaded data in that folder.
            temp_data = temp_import[temp_name_import]
            if temp_data is None:
                print(temp_name_import + " is NoneType! SKIP!")
                print(temp_import)
            elif temp_data.empty:
                print(temp_name_import + " is None! SKIP!")
                print(temp_import)
            else:
                temp_data.to_excel(jutidizhi + country_name + ".xlsx")
                print("DATA NAME IS " + temp_name_import + ". COMPLETED! ")
            return
thread_list = []  # threads that have been started and not yet reaped
MAX_THREAD_NUM = 30  # maximum number of download threads alive at once
semaphore = threading.BoundedSemaphore(MAX_THREAD_NUM)  # a plain Semaphore would also work
goods = ['total', '0111', '0221', '0222', '041', '042', '044', '0611', '2214']

if __name__ == "__main__":
    # One download per (good, year, country). Threads are created lazily and
    # at most MAX_THREAD_NUM are alive at any time, instead of starting one
    # OS thread per combination up front.
    for good in goods:
        for year in range(1962, 2017):
            for country_code in countries:
                if len(thread_list) >= MAX_THREAD_NUM:
                    # Wait for the oldest thread, then drop every finished one.
                    thread_list[0].join()
                    thread_list[:] = [t for t in thread_list if t.is_alive()]
                m = DownloadData(country_code, year, good)
                m.start()
                thread_list.append(m)
    # Keep the main thread alive until the remaining downloads have finished.
    for m in thread_list:
        m.join()