下载表格

# -*- coding: utf-8 -*-

import csv
import time
import threading
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests


# Country dictionary: display name -> numeric country code, exactly as used
# by the MEIDB query form's "cntcode" POST field (see get_html's formdata).
country_dict={'AFGHANISTAN TIS': '1', 'ALBANIA': '3', 'ALGERIA': '5', 'AMERI SAMOA': '7', 'ANDORRA': '9', 'ANGOLA': '11', 'ANGUILLA': '12', 'ANTARTICA': '14', 'ANTIGUA': '13', 'ARGENTINA': '15', 'ARMENIA': '16', 'ARUBA': '20', 'AUSTRALIA': '17', 'AUSTRIA': '19', 'AZERBAIJAN': '21', 'BAHAMAS': '23', 'BAHARAIN IS': '25', 'BANGLADESH PR': '27', 'BARBADOS': '29', 'BELARUS': '55', 'BELGIUM': '33', 'BELIZE': '31', 'BENIN': '35', 'BERMUDA': '37', 'BHUTAN': '38', 'BOLIVIA': '39', 'BOSNIA-HRZGOVIN': '40', 'BOTSWANA': '41', 'BR VIRGN IS': '45', 'BRAZIL': '43', 'BRUNEI': '47', 'BULGARIA': '49', 'BURKINA FASO': '50', 'BURUNDI': '53', 'C AFRI REP': '67', 'CAMBODIA': '56', 'CAMEROON': '57', 'CANADA': '59', 'CANARY IS': '61', 'CAPE VERDE IS': '63', 'CAYMAN IS': '65', 'CHAD': '69', 'CHANNEL IS': '71', 'CHILE': '73', 'CHINA P RP': '77', 'CHRISTMAS IS.': '79', 'COCOS IS': '81', 'COLOMBIA': '83', 'COMOROS': '85', 'CONGO D. REP.': '459', 'CONGO P REP': '87', 'COOK IS': '89', 'COSTA RICA': '91', "COTE D' IVOIRE": '199', 'CROATIA': '92', 'CUBA': '93',
              'CURACAO': '276', 'CYPRUS': '95', 'CZECH REPUBLIC': '98', 'DENMARK': '101', 'DJIBOUTI': '102', 'DOMINIC REP': '103', 'DOMINICA': '105', 'ECUADOR': '109', 'EGYPT A RP': '111', 'EL SALVADOR': '113', 'EQUTL GUINEA': '117', 'ERITREA': '116', 'ESTONIA': '114', 'ETHIOPIA': '115', 'FALKLAND IS': '123', 'FAROE IS.': '121', 'FIJI IS': '127', 'FINLAND': '125', 'FR GUIANA': '131', 'FR POLYNESIA': '133', 'FR S ANT TR': '135', 'FRANCE': '129', 'GABON': '141', 'GAMBIA': '143', 'GEORGIA': '145', 'GERMANY': '147', 'GHANA': '149', 'GIBRALTAR': '151', 'GREECE': '155', 'GREENLAND': '157', 'GRENADA': '159', 'GUADELOUPE': '161', 'GUAM': '163', 'GUATEMALA': '165', 'GUERNSEY': '124', 'GUINEA': '167', 'GUINEA BISSAU': '169', 'GUYANA': '171', 'HAITI': '175', 'HEARD MACDONALD': '176', 'HONDURAS': '177', 'HONG KONG': '179', 'HUNGARY': '181', 'ICELAND': '185', 'INDONESIA': '187', 'INSTALLATIONS IN INTERNATIONAL WATERS': '2', 'IRAN': '189', 'IRAQ': '191', 'IRELAND': '193', 'ISRAEL': '195', 'ITALY': '197', 'JAMAICA': '203', 'JAPAN': '205', 'JERSEY': '122',
              'JORDAN': '207', 'KAZAKHSTAN': '212', 'KENYA': '213', 'KIRIBATI REP': '214', 'KOREA DP RP': '215', 'KOREA RP': '217', 'KUWAIT': '219', 'KYRGHYZSTAN': '216', 'LAO PD RP': '223', 'LATVIA': '224', 'LEBANON': '225', 'LESOTHO': '227', 'LIBERIA': '229', 'LIBYA': '231', 'LIECHTENSTEIN': '233', 'LITHUANIA': '234', 'LUXEMBOURG': '235', 'MACAO': '239', 'MACEDONIA': '240', 'MADAGASCAR': '241', 'MALAWI': '243', 'MALAYSIA': '245', 'MALDIVES': '247', 'MALI': '249', 'MALTA': '251', 'MARSHALL ISLAND': '252', 'MARTINIQUE': '253', 'MAURITANIA': '255', 'MAURITIUS': '257', 'MAYOTTE': '34', 'MEXICO': '259', 'MICRONESIA': '256', 'MOLDOVA': '260', 'MONACO': '262', 'MONGOLIA': '261', 'MONTENEGRO': '356', 'MONTSERRAT': '263', 'MOROCCO': '265', 'MOZAMBIQUE': '267', 'MYANMAR': '258', 'N. MARIANA IS.': '294', 'NAMIBIA': '269', 'NAURU RP': '271', 'NEPAL': '273', 'NETHERLAND': '275', 'NETHERLANDANTIL': '277', 'NEUTRAL ZONE': '279', 'NEW CALEDONIA': '281', 'NEW ZEALAND': '285', 'NICARAGUA': '287', 'NIGER': '289', 'NIGERIA': '291', 'NIUE IS': '293', 'NORFOLK IS': '295',
              'NORWAY': '297', 'OMAN': '301', 'PACIFIC IS': '307', 'PAKISTAN IR': '309', 'PALAU': '310', 'PANAMA C Z': '313', 'PANAMA REPUBLIC': '311', 'PAPUA N GNA': '315', 'PARAGUAY': '317', 'PERU': '319', 'Petroleum Products': '0', 'PHILIPPINES': '323', 'PITCAIRN IS.': '321', 'POLAND': '325', 'PORTUGAL': '327', 'PUERTO RICO': '331', 'QATAR': '335', 'REUNION': '339', 'ROMANIA': '343', 'RUSSIA': '344', 'RWANDA': '345', 'SAHARWI A.DM RP': '347', 'SAMOA': '447', 'SAN MARINO': '346', 'SAO TOME': '349', 'SAUDI ARAB': '351', 'SENEGAL': '353', 'SERBIA': '352', 'SEYCHELLES': '355', 'SIERRA LEONE': '357', 'SINGAPORE': '359', 'SINT MAARTEN (DUTCH PART)': '278', 'SLOVAK REP': '358', 'SLOVENIA': '360', 'SOLOMON IS': '361', 'SOMALIA': '363', 'SOUTH AFRICA': '365', 'SOUTH SUDAN': '382', 'SPAIN': '367', 'SRI LANKA DSR': '369', 'ST HELENA': '371', 'ST KITT N A': '373', 'ST LUCIA': '375', 'ST PIERRE': '377', 'ST VINCENT': '379', 'STATE OF PALEST': '196', 'SUDAN': '381', 'SURINAME': '383', 'SWAZILAND': '385', 'SWEDEN': '387', 'SWITZERLAND': '389', 'SYRIA': '391', 'TAIWAN': '75',
              'TAJIKISTAN': '393', 'TANZANIA REP': '395', 'THAILAND': '397', 'TIMOR LESTE': '329', 'TOGO': '399', 'TOKELAU IS': '401', 'TONGA': '403', 'Trade to Unspecified Countries': '999', 'TRINIDAD': '405', 'TUNISIA': '407', 'TURKEY': '409', 'TURKMENISTAN': '410', 'TURKS C IS': '411', 'TUVALU': '413', 'U ARAB EMTS': '419', 'U K': '421', 'U S A': '423', 'UGANDA': '417', 'UKRAINE': '422', 'UNION OF SERBIA & MONTENEGRO': '354', 'UNSPECIFIED': '599', 'URUGUAY': '427', 'US MINOR OUTLYING ISLANDS': '424', 'UZBEKISTAN': '430', 'VANUATU REP': '431', 'VATICAN CITY': '198', 'VENEZUELA': '433', 'VIETNAM SOC REP': '437', 'VIRGIN IS US': '439', 'WALLIS F IS': '443', 'YEMEN REPUBLC': '453', 'ZAMBIA': '461', 'ZIMBABWE': '463'}

# Reverse lookup: numeric country code -> country display name.
country_new=dict(zip(country_dict.values(),country_dict.keys()))

# Fetch one month's trade table from the MEIDB endpoint.
def get_html(url, month, year, country_code, rad, count=1):
    """POST the query form and return the response HTML, or None on failure.

    Parameters:
        url: MEIDB endpoint (exports or imports variant).
        month: month number, 1-12.
        year: four-digit year as a string.
        country_code: numeric country code (value of country_dict).
        rad: name of the value-type radio field — "radiousd" for US dollars
             or "radioqty" for quantity.
        count: current attempt number; once more than 10 attempts have been
               used, the failed query is logged to fail_file.txt and None
               is returned.
    """
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
    formdata = {
        "radioCY": "1",           # first row: fixed value '1'
        "Mm1": month,             # month: variable, one of 12 months
        "yy1": year,              # year: variable, string form
        "cntcode": country_code,  # country code: variable, string form
        "hslevel": "8",           # HS code level: fixed, 8-digit codes
        "sort": "0",              # code type: fixed, HS CODE
        "radioDAll": "1",         # second-to-last row: fixed value '1'
        rad: "1"                  # dollars: "radiousd"="1"; quantity: "radioqty"="1"
    }
    # Bounded retry loop (replaces the original unbounded-stack recursion).
    # Each network error consumes one attempt until the 10-try budget is gone.
    while count <= 10:
        print('爬取的国家========' + country_code + '-' + year + '-' + str(month))
        print('爬取的次数是=======' + str(count))
        try:
            response = requests.post(url, headers=header, data=formdata, timeout=5)
            return response.text
        except requests.RequestException:
            # Only network/HTTP errors are retried; a bare `except` would
            # also swallow KeyboardInterrupt and programming errors.
            count += 1
    # All attempts exhausted: record the query so it can be re-run later.
    with open('fail_file.txt', 'a+') as fail_file:
        fail_file.write(url[-1] + '--' + country_code + '--' + rad + '--' + year + '--' + str(month) + '\n')
    return None


# Parse the response HTML: extract all tables into one DataFrame, or log
# the query to nodata_file.txt when the page contains no data.
def pandas_deal(original_url, original_year, html_data, countryname, month):
    """Return all <table> elements of *html_data* concatenated into one
    DataFrame, or None (after logging) when the page has no tables.

    Parameters:
        original_url: the endpoint URL queried (its last char tags the log line).
        original_year: year string of the query.
        html_data: raw HTML text returned by get_html().
        countryname: country display name, for the log line.
        month: month number of the query.
    """
    soup = bs(html_data, 'lxml')
    tables = soup.select('table')
    if not tables:
        # No table in the response — remember which query was empty so it
        # can be distinguished from a download failure.
        with open('nodata_file.txt', 'a+') as nodata_file:
            nodata_file.write(original_url[-1] + '--' + countryname + '--' + original_year + '--' + str(month) + '\n')
        print('该文件没有数据......')
        return None
    # pd.read_html yields a list of DataFrames per table; flatten them all.
    df_list = [pd.concat(pd.read_html(table.prettify())) for table in tables]
    return pd.concat(df_list)



# Merge the quantity table with the US-dollar table into the target layout.
def df_final(df, df_usd, countryname):
    """Combine quantity table *df* with dollar table *df_usd* for one country.

    Parameters:
        df: raw quantity table whose FIRST ROW holds the real column headers
            (10 columns, including "S.No.", "HSCode" and "Unit"); repeated
            header rows may be embedded in the body.
        df_usd: already-cleaned dollar table with "S.No.", "HSCode" and two
            year-value columns at positions 3 and 4.
        countryname: country display name, written into the COUNTRY column.

    Returns a DataFrame with the 9 report columns in fixed order.
    """
    # Promote the first data row to column headers, then drop every repeated
    # header row ("S.No." appearing in the S.No. column) from the body.
    df = df.rename(index=str, columns={i: df.iat[0, i] for i in range(10)})
    df = df[~df["S.No."].isin(["S.No."])]
    df = df.reset_index(drop=True)  # cleaned quantity table

    # Start from the dollar table's key columns. .copy() avoids pandas'
    # chained-assignment warning and any accidental write-back into df_usd.
    df_all = df_usd[['S.No.', 'HSCode', df_usd.columns[3], df_usd.columns[4]]].copy()
    df_all['COUNTRY'] = countryname
    df_all['HSCode-数量表'] = df['HSCode']
    df_all[df.columns[4] + '数量'] = df[df.columns[4]]
    df_all[df.columns[5] + '数量'] = df[df.columns[5]]
    df_all["Unit (1000‘)"] = df["Unit"]
    df_all = df_all.rename(index=str, columns={"HSCode": "HSCode-金额表编码",
                                               df_usd.columns[3]: df_usd.columns[3] + "-金额百万美元",
                                               df_usd.columns[4]: df_usd.columns[4] + "-金额百万美元"})

    # Fixed output column order for the final report.
    df_all = df_all[["S.No.", "COUNTRY", "HSCode-金额表编码", "HSCode-数量表",
                     df_usd.columns[3] + "-金额百万美元", df_usd.columns[4] + "-金额百万美元",
                     df.columns[4] + "数量", df.columns[5] + "数量", "Unit (1000‘)"]]

    return df_all



# Append the six summary statistic lines to compare_num.txt.
def compare_file(a, b, c, d, e, f):
    """Write the six comparison strings, one per line, to compare_num.txt.

    The file is opened in append mode so repeated runs accumulate. The
    original bound the file handle to a name shadowing this function and
    called close() redundantly after the with-block; the context manager
    alone handles closing.
    """
    with open('compare_num.txt', 'a+') as out:
        out.writelines(line + '\n' for line in (a, b, c, d, e, f))


# Compute summary statistics from the merged target table and persist them.
def df_final_count(dffinal, year, month, countryname, reptilr_url):
    """Build six labelled summary strings from *dffinal* and append them to
    compare_num.txt via compare_file().

    Note: the two quantity columns (positions 6 and 7) are cast to float
    in place, so the caller's DataFrame is modified.
    """
    print('要保存的数据看看================')
    print(dffinal.head())

    marker = reptilr_url[-1:]
    last_year = str(int(year) - 1)

    # Site-reported grand totals (last row of the table).
    total_us_old = f"{marker}--{countryname}--{last_year}--{month}--total=={dffinal.iat[-1, 4]}"
    print(total_us_old)
    total_qty_new = f"{marker}--{countryname}--{year}--{month}--total=={dffinal.iat[-1, 5]}"
    print(total_qty_new)
    # Dollar-value sums, with the total-row cell subtracted out.
    us_new = f"{marker}--{countryname}--{year}--{month}--us=={dffinal[dffinal.columns[5]].sum() - dffinal.iat[-1, 4]}"
    print(us_new)
    us_old = f"{marker}--{countryname}--{last_year}--{month}--us=={dffinal[dffinal.columns[4]].sum() - dffinal.iat[-1, 5]}"
    print(us_old)
    print(type(us_old))
    # Quantity columns arrive as strings; cast in place before summing.
    dffinal[dffinal.columns[6]] = dffinal[dffinal.columns[6]].astype('float')
    dffinal[dffinal.columns[7]] = dffinal[dffinal.columns[7]].astype('float')
    qty_new = f"{marker}--{countryname}--{year}--{month}--quantity=={dffinal[dffinal.columns[7]].sum()}"
    print(qty_new)
    qty_old = f"{marker}--{countryname}--{last_year}--{month}--quantity=={dffinal[dffinal.columns[6]].sum()}"
    print(qty_old)

    print('开始把数据写入文件!!!!!!!!!!!!')
    compare_file(total_us_old, total_qty_new, us_new, us_old, qty_new, qty_old)
    print('写完文件!!!!!!!!!!!!')


# Combined fetch-and-process function, used as the target for each worker thread.
def reptile_deal(url, month, year, country_code, country_name,rad):
    """Download one (endpoint, country, year, month, value-type) table.

    For rad != "radioqty" (i.e. the dollar pass) the raw table is saved as a
    CSV under india/exports/ or india/imports/. For rad == "radioqty" the
    previously saved dollar CSV is read back, merged with the quantity table
    via df_final(), written as *--final.csv, and summary statistics are
    appended via df_final_count(). The dollar pass must therefore run before
    the quantity pass (see the rr loop order in main()).
    """
    html=get_html(url, month, year, country_code, rad)
    print('获取请求成功!!!!!!!!!!!!!!!!!!!!')
    if html:
        df = pandas_deal(url, year, html, country_name, month)
        if df is not None:
            if rad == "radioqty":
                try:
                    # NOTE(review): the two branches below differ only in the
                    # final CSV name ("--" vs "-" before the year), and BOTH
                    # read from and write to india/exports/ even for the
                    # imports URL — the else-branch presumably should use
                    # india/imports/; confirm against intended layout.
                    if url == 'http://commerce-app.gov.in/meidb/cntcom.asp?ie=e':
                        df_usd = pd.read_csv("india/exports/" + year + "/" + url[-1]+"--"+country_name + "--" +year+"--"+ str(month) + ".csv",skiprows=1, encoding="gb2312")
                        dffinal = df_final(df, df_usd, country_name)
                        dffinal.to_csv("india/exports/" + year + "/" + url[-1]+"--"+country_name + "--" +year+"--"+ str(month) + "--final.csv",index=False, encoding='gb2312')
                        df_final_count(dffinal, year, month, country_name, url)
                    else:
                        df_usd = pd.read_csv("india/exports/" + year + "/" + url[-1]+"--"+country_name + "--" +year+"--"+ str(month) + ".csv",skiprows=1, encoding="gb2312")
                        dffinal = df_final(df, df_usd, country_name)
                        dffinal.to_csv("india/exports/" + year + "/" +url[-1]+"--"+ country_name + "-" +year+"--"+ str(month) + "--final.csv",index=False, encoding='gb2312')
                        df_final_count(dffinal, year, month, country_name, url)
                except:
                    # Any failure in the merge step (missing dollar CSV, bad
                    # encoding, unexpected table shape) is logged and skipped
                    # so the other worker threads keep running.
                    with open('fail_file.txt', 'a+') as fail_file:
                        fail_file.write('usd_fail--'+url[-1] + '--' + country_code + '--' + rad + '--' + year + '--' + str(month) + '\n')
                    fail_file.close()
                    pass

            else:
                # Dollar pass: just persist the raw table for the later merge.
                if url == 'http://commerce-app.gov.in/meidb/cntcom.asp?ie=e':
                    df.to_csv('india/exports/' + year + '/' + url[-1]+"--"+country_name + '--' + year+"--"+str(month) + ".csv", index=False,encoding='gb2312')
                else:
                    df.to_csv('india/imports/' + year + '/' + url[-1]+"--"+country_name + '--' +year +"--"+str(month) + ".csv", index=False,encoding='gb2312')



def main():
    """Iterate endpoints x years x value-types x countries, spawning one
    daemon thread per month to download and process each table.

    The "radiousd" pass runs before "radioqty" because reptile_deal's
    quantity branch reads back the CSVs the dollar pass saved.
    """
    url_exports = 'http://commerce-app.gov.in/meidb/cntcom.asp?ie=e'  # exports endpoint (not queued below)
    url_imports = 'http://commerce-app.gov.in/meidb/cntcom.asp?ie=i'
    imex = [url_imports]
    year_list = ['2013']
    for uu in imex:
        count = 0  # running tally of (country, month) jobs per endpoint
        for yy in year_list:
            for rr in ["radiousd", "radioqty"]:
                for cc_name, cc_code in country_dict.items():
                    # One thread per month; started and joined in batches
                    # of 12 so at most one country is in flight at a time.
                    thread_list = []
                    for mm in range(1, 13):
                        count += 1
                        t = threading.Thread(target=reptile_deal,
                                             args=(uu, mm, yy, cc_code, cc_name, rr))
                        print('年份:' + yy + ';' + '当年处理的数据数量===' + str(count))
                        # Daemon threads die with the main thread; the
                        # setDaemon() method is deprecated since Python 3.10.
                        t.daemon = True
                        thread_list.append(t)
                    for t in thread_list:
                        t.start()
                    for t in thread_list:
                        # Wait at most 20s per thread before moving on.
                        t.join(20)


if __name__ == '__main__':
    # Guard so importing this module no longer kicks off the full crawl.
    main()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值