UN Comtrade python爬虫实现,多线程动态ip

原帖见【爬虫】Python使用动态IP,多线程,爬取uncomtrade的数据_学金融的程序员懒羊羊的博客-CSDN博客_爬虫动态ip

原帖中的url以及自定义函数代码有小问题,本帖修改了url代码,修复了自定义函数的递归问题,并跑通了代码。

需要自己去ip代理平台购买IP代理。

# coding=gbk
import requests
import pandas as pd
from random import randint
import datetime
from io import StringIO
import threading
import os


class proxy:
    """Connection settings for an authenticated HTTP proxy plus a User-Agent pool.

    An instance carries the raw credentials, the list of User-Agent strings to
    rotate through, and ``proxies`` -- a mapping with ``"http"`` and ``"https"``
    keys that both point at ``http://user:pass@host:port``.

    The credentials are interpolated into the URL verbatim, so any character
    that is special inside a URL must already be percent-encoded by the caller.
    """

    # Class-level defaults. Every one of them is replaced per instance in
    # __init__; they only matter when the class itself is inspected.
    proxyHost = None
    proxyPort = None
    proxyUser = None
    proxyPass = None
    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.0 Safari/537.36",
    ]
    proxies = {}

    def __init__(self, proxyHost, proxyPort, proxyUser, proxyPass, user_agents):
        """Store the proxy settings and build the ``proxies`` mapping.

        Args:
            proxyHost: Host name or IP address of the proxy server.
            proxyPort: Port of the proxy server (string or int).
            proxyUser: User name used to authenticate against the proxy.
            proxyPass: Password used to authenticate against the proxy.
            user_agents: List of User-Agent strings to choose from per request.
        """
        self.proxyHost = proxyHost
        self.proxyPort = proxyPort
        self.proxyUser = proxyUser
        self.proxyPass = proxyPass
        self.user_agents = user_agents

        proxy_url = f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"
        # The same tunnel endpoint serves both plain and TLS traffic.
        self.proxies = {
            "http": proxy_url,
            "https": proxy_url,
        }


# Failures are appended to CSV logs on disk; the scraped data itself is
# returned to the caller (kept in memory).
def download_url(url, ifuse_proxy=False, proxy=None, max_retries=None):
    """Download ``url`` and parse the response body as CSV, retrying on failure.

    A request is retried when ``requests`` raises a ``RequestException`` or when
    the body matches one of the known error pages (502, "Too Many Requests",
    403, "Proxy Bad Server", generic API error). Retries run in a loop rather
    than by recursion, so a long outage cannot exhaust the call stack.

    Args:
        url: Address to fetch.
        ifuse_proxy: When true, route the request through ``proxy.proxies`` and
            send a random ``User-Agent`` (from ``proxy.user_agents``) and a
            random ``Proxy-Tunnel`` header on every attempt.
        proxy: With ``ifuse_proxy`` true, an object exposing ``user_agents`` and
            ``proxies``. Otherwise the value is handed to ``requests.get`` as
            its ``proxies`` argument unchanged.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries without limit.

    Returns:
        A ``pandas.DataFrame`` parsed from the response, or ``None`` when the
        server reports that no data matches the query or when ``max_retries``
        is exhausted.
    """
    # The logs live next to the downloaded data; make sure the folder exists
    # before the first failure needs to be recorded.
    os.makedirs("./uncomtrade_data", exist_ok=True)

    exact_error_bodies = (
        "<html><body><h1>502 Bad Gateway</h1>\nThe server returned an invalid or incomplete response.\n</body></html>\n",
        "Too Many Requests.\n",
        "{\"Message\":\"An error has occurred.\"}",
    )
    error_fragments = (
        ' <h2>403 - Forbidden: Access is denied.</h2>',
        "Proxy Bad Server",
    )
    no_data_marker = ('No data matches your query or your query is too complex. '
                      'Request JSON or XML format for more information.')

    retries = 0
    while True:
        try:
            if ifuse_proxy:
                # Pick a fresh User-Agent and tunnel id for every attempt.
                random_agent = proxy.user_agents[randint(0, len(proxy.user_agents) - 1)]
                header = {
                    "Proxy-Tunnel": str(randint(1, 10000)),
                    "User-Agent": random_agent,
                }
                content = requests.get(url, timeout=100, headers=header, proxies=proxy.proxies)
            else:
                content = requests.get(url, timeout=100, proxies=proxy)
        except requests.RequestException as e:
            print(type(e).__name__ + " has occurred, change proxy!")
            with open("./uncomtrade_data/exp.csv", 'a', encoding="utf-8") as log:
                log.write(
                    str(datetime.datetime.now()) + "," + str(type(e).__name__) + "," + str(url) + "\n")
        else:
            text = content.text
            # Some failures arrive as a normal response whose body is an error page.
            if text in exact_error_bodies or any(fragment in text for fragment in error_fragments):
                with open("./uncomtrade_data/serverError.csv", 'a', encoding="utf-8") as log:
                    log.write(str(datetime.datetime.now()) + "," + str(url) + "\n")
                print(str(url) + ' 403 forbidden, retrying')
            elif no_data_marker in text:
                print(str(url) + ' no data')
                return None
            else:
                return pd.read_csv(StringIO(text), on_bad_lines='skip')

        # Reaching this point means the attempt failed and should be retried.
        if max_retries is not None and retries >= max_retries:
            print(str(url) + ' giving up after ' + str(retries) + ' retries')
            return None
        retries += 1


def get_data_un_comtrade(max_un=100000, r='156', freq='A', ps='2021', px='S1', p='all', rg='1', cc='TOTAL', fmt='csv',
                         type_un='C', ifuse_proxy=False, proxy=None):
    '''
    Build a UN Comtrade API query, download it and return the result under a descriptive name.

    max_un: maximum number of records to return (default 100000);
    r: reporting area, i.e. the country the data is reported by;
    freq: annual or monthly data ('A' or 'M');
    ps: the period (year) requested;
    px: classification, e.g. 'S3' for the commonly used SITC Revision 3;
    p: partner area; e.g. for China's trade with Russia the reporter is China and the partner is Russia;
    rg: trade flow, 1 for imports and 2 for exports (string or int);
    cc: commodity code;
    fmt: output format, 'csv' or 'json' (default 'csv');
    type_un: trade type, commodities or services;
    ifuse_proxy: whether to send the request through a proxy;
    proxy: proxy settings, forwarded to download_url().

    return: {data name: data}, i.e. {str: DataFrame}; the value is None when download_url() returns None.
    '''
    pre_url = "http://comtrade.un.org/api/get?max={}&type={}&freq={}&px={}&ps={}&r={}&p={}&rg={}&cc={}&fmt={}"
    url_use = pre_url.format(max_un, type_un, freq, px, ps, r, p, rg, cc, fmt)
    print("Getting data from:" + url_use)
    data = download_url(url_use, ifuse_proxy=ifuse_proxy, proxy=proxy)

    # rg normally arrives as a string ('1'/'2'); compare as text so that both
    # '1' and 1 are recognised as imports.
    ex_or_in = 'IMPORT' if str(rg) == '1' else 'EXPORT'

    # str() keeps the name buildable when a caller passes ints (e.g. ps=2021).
    data_name = "_".join(str(part) for part in (ps, r, p, px, cc, ex_or_in, freq))
    return {data_name: data}


# Maps reporter codes (as strings) to display names. The keys are sent as the
# `r` query parameter by DownloadData.run(), and the values are used there as
# the names of the output .xlsx files.
# NOTE(review): '531' reads 'Curaao' -- presumably 'Curaçao' with the cedilla
# lost in encoding. The value ends up in a file name, so confirm before changing.
countries = {'156': 'China', '344': 'China, Hong Kong SAR', '446': 'China, Macao SAR',
             '4': 'Afghanistan', '8': 'Albania', '12': 'Algeria', '20': 'Andorra', '24': 'Angola', '660': 'Anguilla',
             '28': 'Antigua and Barbuda', '32': 'Argentina', '51': 'Armenia',
             '533': 'Aruba', '36': 'Australia', '40': 'Austria', '31': 'Azerbaijan', '44': 'Bahamas', '48': 'Bahrain',
             '50': 'Bangladesh', '52': 'Barbados',
             '112': 'Belarus', '56': 'Belgium', '58': 'Belgium-Luxembourg', '84': 'Belize', '204': 'Benin',
             '60': 'Bermuda', '64': 'Bhutan', '68': 'Bolivia (Plurinational State of)', '535': 'Bonaire',
             '70': 'Bosnia Herzegovina', '72': 'Botswana', '92': 'Br. Virgin Isds', '76': 'Brazil',
             '96': 'Brunei Darussalam', '100': 'Bulgaria', '854': 'Burkina Faso', '108': 'Burundi', '132': 'Cabo Verde',
             '116': 'Cambodia',
             '120': 'Cameroon', '124': 'Canada', '136': 'Cayman Isds', '140': 'Central African Rep.', '148': 'Chad',
             '152': 'Chile',
             '170': 'Colombia', '174': 'Comoros', '178': 'Congo', '184': 'Cook Isds', '188': 'Costa Rica',
             '384': "Cote d'Ivoire", '191': 'Croatia', '192': 'Cuba', '531': 'Curaao', '196': 'Cyprus',
             '203': 'Czechia',
             '200': 'Czechoslovakia', '408': "Dem. People's Rep. of Korea", '180': 'Dem. Rep. of the Congo',
             '208': 'Denmark', '262': 'Djibouti', '212': 'Dominica', '214': 'Dominican Rep.', '218': 'Ecuador',
             '818': 'Egypt', '222': 'El Salvador', '226': 'Equatorial Guinea', '232': 'Eritrea', '233': 'Estonia',
             '231': 'Ethiopia', '234': 'Faeroe Isds', '238': 'Falkland Isds (Malvinas)', '242': 'Fiji',
             '246': 'Finland',
             '251': 'France', '254': 'French Guiana', '258': 'French Polynesia', '583': 'FS Micronesia', '266': 'Gabon',
             '270': 'Gambia', '268': 'Georgia', '276': 'Germany', '288': 'Ghana', '292': 'Gibraltar',
             '300': 'Greece', '304': 'Greenland', '308': 'Grenada', '312': 'Guadeloupe', '320': 'Guatemala',
             '324': 'Guinea', '624': 'Guinea-Bissau', '328': 'Guyana', '332': 'Haiti',
             '336': 'Holy See (Vatican City State)',
             '340': 'Honduras', '348': 'Hungary', '352': 'Iceland', '699': 'India', '364': 'Iran', '368': 'Iraq',
             '372': 'Ireland', '376': 'Israel', '381': 'Italy', '388': 'Jamaica', '392': 'Japan',
             '400': 'Jordan', '398': 'Kazakhstan', '404': 'Kenya', '296': 'Kiribati', '414': 'Kuwait',
             '417': 'Kyrgyzstan', '418': "Lao People's Dem. Rep.", '428': 'Latvia', '422': 'Lebanon', '426': 'Lesotho',
             '430': 'Liberia', '434': 'Libya', '440': 'Lithuania', '442': 'Luxembourg', '450': 'Madagascar',
             '454': 'Malawi', '458': 'Malaysia', '462': 'Maldives', '466': 'Mali', '470': 'Malta',
             '584': 'Marshall Isds',
             '474': 'Martinique', '478': 'Mauritania', '480': 'Mauritius', '175': 'Mayotte', '484': 'Mexico',
             '496': 'Mongolia', '499': 'Montenegro', '500': 'Montserrat', '504': 'Morocco', '508': 'Mozambique',
             '104': 'Myanmar',
             '580': 'N. Mariana Isds', '516': 'Namibia', '524': 'Nepal', '530': 'Neth. Antilles',
             '532': 'Neth. Antilles and Aruba', '528': 'Netherlands', '540': 'New Caledonia', '554': 'New Zealand',
             '558': 'Nicaragua',
             '562': 'Niger', '566': 'Nigeria', '579': 'Norway', '512': 'Oman', '586': 'Pakistan', '585': 'Palau',
             '591': 'Panama', '598': 'Papua New Guinea', '600': 'Paraguay', '459': 'Peninsula Malaysia', '604': 'Peru',
             '608': 'Philippines',
             '616': 'Poland', '620': 'Portugal', '634': 'Qatar', '410': 'Rep. of Korea', '498': 'Rep. of Moldova',
             '638': 'Réunion', '642': 'Romania', '643': 'Russian Federation', '646': 'Rwanda', '647': 'Ryukyu Isd',
             '461': 'Sabah',
             '652': 'Saint Barthelemy', '654': 'Saint Helena', '659': 'Saint Kitts and Nevis', '662': 'Saint Lucia',
             '534': 'Saint Maarten', '666': 'Saint Pierre and Miquelon', '670': 'Saint Vincent and the Grenadines',
             '882': 'Samoa', '674': 'San Marino', '678': 'Sao Tome and Principe', '457': 'Sarawak',
             '682': 'Saudi Arabia', '686': 'Senegal', '688': 'Serbia', '690': 'Seychelles', '694': 'Sierra Leone',
             '702': 'Singapore',
             '703': 'Slovakia', '705': 'Slovenia', '90': 'Solomon Isds', '706': 'Somalia', '710': 'South Africa',
             '728': 'South Sudan', '724': 'Spain', '144': 'Sri Lanka', '275': 'State of Palestine',
             '729': 'Sudan', '740': 'Suriname', '748': 'Eswatini', '752': 'Sweden', '757': 'Switzerland',
             '760': 'Syria', '762': 'Tajikistan', '807': 'North Macedonia', '764': 'Thailand', '626': 'Timor-Leste',
             '768': 'Togo', '772': 'Tokelau', '776': 'Tonga', '780': 'Trinidad and Tobago', '788': 'Tunisia',
             '795': 'Turkmenistan', '796': 'Turks and Caicos Isds', '798': 'Tuvalu', '800': 'Uganda',
             '804': 'Ukraine', '784': 'United Arab Emirates', '826': 'United Kingdom', '834': 'United Rep. of Tanzania',
             '858': 'Uruguay', '850': 'US Virgin Isds', '842': 'USA', '860': 'Uzbekistan',
             '548': 'Vanuatu', '862': 'Venezuela', '704': 'Viet Nam', '876': 'Wallis and Futuna Isds', '887': 'Yemen',
             '894': 'Zambia', '716': 'Zimbabwe'}

# Example of fetching data with get_data_un_comtrade() defined above.
# proxyHost, proxyPort, proxyUser, proxyPass and user_agents must be set to
# match the proxy service you use. The connection details are read from the
# PROXY_HOST / PROXY_PORT / PROXY_USER / PROXY_PASS environment variables so
# that credentials do not have to be written into the source file.
proxyHost = os.environ.get("PROXY_HOST", "")
proxyPort = os.environ.get("PROXY_PORT", "")
proxyUser = os.environ.get("PROXY_USER", "")
proxyPass = os.environ.get("PROXY_PASS", "")
user_agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"]
# Create the proxy instance shared by all download threads.
proxy_use = proxy(proxyHost, proxyPort, proxyUser, proxyPass, user_agents)


class DownloadData(threading.Thread):
    """Worker thread that downloads one import table and stores it as an .xlsx file.

    Each instance handles a single (reporter country, year, commodity)
    combination. The output goes to
    ``./uncomtrade_data/<good>/<year>/<country name>.xlsx``.
    """

    all_data = None

    def __init__(self, country_code, year, good):
        """Remember what to download.

        Args:
            country_code: Key of the module-level ``countries`` mapping.
            year: Year to request; stored as a string.
            good: Commodity code passed to the API as ``cc``.
        """
        super().__init__()
        self.country_code = country_code
        self.year = str(year)
        self.good = good

    def run(self):
        """Download the data through the shared proxy and write it to disk.

        The whole body runs under the module-level ``semaphore`` so that only a
        limited number of downloads are active at once. Nothing is written when
        the download yields ``None`` or an empty table.
        """
        with semaphore:
            country_name = countries[self.country_code]
            print(country_name + " BEGINS! TIME:", datetime.datetime.now())
            temp_import = get_data_un_comtrade(max_un=100000, r=self.country_code, freq='A', ps=self.year, px='S1',
                                               p='all', rg='1', cc=self.good, fmt='csv', type_un='C', ifuse_proxy=True,
                                               proxy=proxy_use)
            # The result is a single-entry dict: {data name: DataFrame or None}.
            temp_name_import, temp_data = next(iter(temp_import.items()))

            # Create the per-commodity / per-year folder. exist_ok avoids the
            # check-then-create race between workers targeting the same folder.
            jutidizhi = "./uncomtrade_data/{}/{}/".format(self.good, self.year)
            os.makedirs(jutidizhi, exist_ok=True)

            # Store the downloaded data in that folder.
            if temp_data is None:
                print(temp_name_import + " is NoneType! SKIP!")
                print(temp_import)
            elif temp_data.empty:
                print(temp_name_import + " is None! SKIP!")
                print(temp_import)
            else:
                temp_data.to_excel(jutidizhi + country_name + ".xlsx")
                print("DATA NAME IS " + temp_name_import + ". COMPLETED!  ")
        return


# Worker threads that are currently running. The list is kept to at most
# MAX_THREAD_NUM entries so the process never holds one OS thread per
# (good, year, country) combination -- that would be tens of thousands.
thread_list = []
MAX_THREAD_NUM = 30  # maximum number of threads
# DownloadData.run() acquires this semaphore, so it has to exist at module level.
semaphore = threading.BoundedSemaphore(MAX_THREAD_NUM)  # Semaphore would work as well
goods = ['total', '0111', '0221', '0222', '041', '042', '044', '0611', '2214']
for good in goods:
    for year in range(1962, 2017):
        for country_code in countries:
            if len(thread_list) >= MAX_THREAD_NUM:
                # Window is full: wait for the oldest worker, then drop every
                # worker that has finished in the meantime.
                thread_list[0].join()
                thread_list = [t for t in thread_list if t.is_alive()]
            m = DownloadData(country_code, year, good)
            m.start()  # start the worker right away
            thread_list.append(m)

for m in thread_list:
    m.join()  # make the main thread wait until the remaining workers have finished
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值