import requests
from lxml import etree
import pandas as pd
from sqlalchemy import create_engine
import time
import random
# --- Setup: database connection, source table, and result frames -----------
# NOTE(review): credentials/host here are placeholders hard-coded in the URL;
# move them to environment variables or a config file before real use.
# The `encoding=` kwarg was removed from create_engine in SQLAlchemy 1.4/2.0;
# the `charset=utf8` query parameter in the URL already sets the connection
# encoding, so it is dropped here.
connect = create_engine('mysql+pymysql://user:password@xx:xx/xx?charset=utf8')

# Read the list of merchant codes to crawl; `pd.io.sql` is a private alias —
# use the public top-level function instead.
table = pd.read_sql_table('mcc_craw', connect)
table1 = table[:10000]  # cap this run at the first 10,000 codes

# One result frame per data category scraped from the site.
# NOTE(review): only result_A is filled in the visible code; result_B/C/D are
# presumably populated by code outside this chunk — confirm before removing.
result_A = pd.DataFrame(columns=('mcc_code', 'acquirer_org', 'mch_area',
                                 'mch_type2', 'mch_type1',
                                 'mch_fee_rate_old', 'mch_fee_rate_new',
                                 'coll_date', 'cmbchina_sugg'))
result_B = pd.DataFrame(columns=('mcc_code', 'publish_bank', 'publish_month',
                                 'is_black', 'coll_date'))
result_C = pd.DataFrame(columns=('mcc_code', 'publish_bank', 'coll_date'))
result_D = pd.DataFrame(columns=('mcc_code', 'publish_bank', 'coll_date'))
fail = []  # merchant codes whose page fetch/parse failed
# --- Crawl loop: one HTTP request per merchant code -------------------------
# (Indentation reconstructed: the original paste had lost all block structure
# and would not parse.)
for mcc in table1['TXT_MCHT']:
    date = time.strftime("%Y-%m-%d", time.localtime())  # collection date
    # 15-char codes are queried verbatim; other lengths drop a 4-char prefix.
    if len(mcc) == 15:
        url = 'http://mcc.hiwbb.com/?code=' + mcc
    else:
        url = 'http://mcc.hiwbb.com/?code=' + mcc[4:]
    try:
        print(url)
        r = requests.get(url).text
        comments = etree.HTML(r)
        # --A------------------------------------------------------------
        # Acquiring organization
        a1 = comments.xpath('/html/body/div/div/div[3]/div/div/code[2]/text()')[0]
        # Merchant area
        a2 = comments.xpath('/html/body/div/div/div[3]/div/div/code[3]/text()')[0]
        try:
            # Merchant type (fine-grained)
            a3 = comments.xpath('/html/body/div/div/div[3]/div/div/code[4]/text()')[0]
            # Merchant category (coarse)
            a4 = comments.xpath('/html/body/div/div/div[3]/div/div/code[5]/text()')[0]
            # Fee-rate text, e.g. "现0.38% 原0.78%": slice out the numbers
            # between the marker character and the following '%'.
            a56 = comments.xpath('/html/body/div/div/div[3]/div/div/code[6]/text()')[0]
            # NOTE(review): the '现' (current) value lands in
            # mch_fee_rate_old and the '原' (original) value in
            # mch_fee_rate_new — this mapping looks inverted; confirm
            # against the target schema before swapping.
            a5 = a56[a56.find('现') + 1:a56.find('%')]
            a6 = a56[a56.find('原') + 1:a56.rfind('%')]
        except Exception:
            # Optional fields: page variant without type/fee information.
            a3 = None
            a4 = None
            a5 = None
            a6 = None
        # CMB (招行) suggestion: the page layout varies, so fall through
        # three xpath variants until one matches.
        try:
            try:
                a7 = comments.xpath('/html/body/div/div/div[3]/div/div/div[10]/table/tr[1]/td[3]/span/text()')[0]
            except Exception:
                a7 = comments.xpath('/html/body/div/div/div[3]/div/div/div[3]/table/tr[1]/td[3]/span/text()')[0]
        except Exception:
            a7 = comments.xpath('/html/body/div/div/div[3]/div/div/div[10]/table/tr/td/span/text()')[0]
        result_a = pd.DataFrame({'mcc_code': [mcc], 'acquirer_org': [a1],
                                 'mch_area': [a2], 'mch_type2': [a3],
                                 'mch_type1': [a4], 'mch_fee_rate_old': [a5],
                                 'mch_fee_rate_new': [a6], 'coll_date': [date],
                                 'cmbchina_sugg': [a7]})
        # DataFrame.append was removed in pandas 2.0 — use pd.concat.
        result_A = pd.concat([result_A, result_a], ignore_index=True)
        print('爬取%s成功!' % mcc)
        # Random jitter between requests to avoid hammering the site.
        time.sleep(random.randint(0, 100) * 0.03)
    except Exception:
        # Any fetch/parse failure: record the code and back off briefly.
        fail.append(mcc)
        rest = random.randint(1, 5)
        print('休息%s秒钟' % rest)
        time.sleep(rest)

# Fix the column order once after the loop (the original redid this on every
# iteration, which was redundant).
result_A = result_A[['mcc_code', 'acquirer_org', 'mch_area', 'mch_type2',
                     'mch_type1', 'mch_fee_rate_old', 'mch_fee_rate_new',
                     'coll_date', 'cmbchina_sugg']]
# --- Persist results and inspect suspicious cells ---------------------------
# `pd.io.sql.to_sql` is a private alias; the public DataFrame method is the
# supported API.
result_A.to_sql('m1_mcc_info', connect, schema='ccia', index=False,
                if_exists='append')

# Error handling: locate cells where the fee-rate slicing failed and the raw
# "原0.38" marker text was stored instead of a bare number.
# (The original had this heading as a bare prose line, which is a SyntaxError.)
for row_index in result_A.index:
    row_values = result_A.loc[row_index].values  # hoisted out of inner loop
    for col_pos, cell in enumerate(row_values):
        if cell == '原0.38':
            print(row_index, col_pos)
            print(cell)

# More crawler examples (original trailing prose line, kept as a comment):
# https://blog.csdn.net/weixin_39777626/article/details/81564819