import requests
from lxml import etree
import pandas as pd
from sqlalchemy import create_engine
import time
import random
# --- Setup: database connection, source table, and result frames -----------
# NOTE(review): credentials/host here are placeholders hard-coded in the URL;
# move them to environment variables or a config file before real use.
# The `encoding=` kwarg was removed from create_engine in SQLAlchemy 1.4/2.0;
# the `charset=utf8` query parameter in the URL already sets the connection
# encoding, so it is dropped here.
connect = create_engine('mysql+pymysql://user:password@xx:xx/xx?charset=utf8')

# Read the list of merchant codes to crawl; `pd.io.sql` is a private alias —
# use the public top-level function instead.
table = pd.read_sql_table('mcc_craw', connect)
table1 = table[:10000]  # cap this run at the first 10,000 codes

# One result frame per data category scraped from the site.
# NOTE(review): only result_A is filled in the visible code; result_B/C/D are
# presumably populated by code outside this chunk — confirm before removing.
result_A = pd.DataFrame(columns=('mcc_code', 'acquirer_org', 'mch_area',
                                 'mch_type2', 'mch_type1',
                                 'mch_fee_rate_old', 'mch_fee_rate_new',
                                 'coll_date', 'cmbchina_sugg'))
result_B = pd.DataFrame(columns=('mcc_code', 'publish_bank', 'publish_month',
                                 'is_black', 'coll_date'))
result_C = pd.DataFrame(columns=('mcc_code', 'publish_bank', 'coll_date'))
result_D = pd.DataFrame(columns=('mcc_code', 'publish_bank', 'coll_date'))
fail = []  # merchant codes whose page fetch/parse failed
# --- Crawl loop: one HTTP request per merchant code -------------------------
# (Indentation reconstructed: the original paste had lost all block structure
# and would not parse.)
for mcc in table1['TXT_MCHT']:
    date = time.strftime("%Y-%m-%d", time.localtime())  # collection date
    # 15-char codes are queried verbatim; other lengths drop a 4-char prefix.
    if len(mcc) == 15:
        url = 'http://mcc.hiwbb.com/?code=' + mcc
    else:
        url = 'http://mcc.hiwbb.com/?code=' + mcc[4:]
    try:
        print(url)
        r = requests.get(url).text
        comments = etree.HTML(r)
        # --A------------------------------------------------------------
        # Acquiring organization
        a1 = comments.xpath('/html/body/div/div/div[3]/div/div/code[2]/text()')[0]
        # Merchant area
        a2 = comments.xpath('/html/body/div/div/div[3]/div/div/code[3]/text()')[0]
        try:
            # Merchant type (fine-grained)
            a3 = comments.xpath('/html/body/div/div/div[3]/div/div/code[4]/text()')[0]
            # Merchant category (coarse)
            a4 = comments.xpath('/html/body/div/div/div[3]/div/div/code[5]/text()')[0]
            # Fee-rate text, e.g. "现0.38% 原0.78%": slice out the numbers
            # between the marker character and the following '%'.
            a56 = comments.xpath('/html/body/div/div/div[3]/div/div/code[6]/text()')[0]
            # NOTE(review): the '现' (current) value lands in
            # mch_fee_rate_old and the '原' (original) value in
            # mch_fee_rate_new — this mapping looks inverted; confirm
            # against the target schema before swapping.
            a5 = a56[a56.find('现') + 1:a56.find('%')]
            a6 = a56[a56.find('原') + 1:a56.rfind('%')]
        except Exception:
            # Optional fields: page variant without type/fee information.
            a3 = None
            a4 = None
            a5 = None
            a6 = None
        # CMB (招行) suggestion: the page layout varies, so fall through
        # three xpath variants until one matches.
        try:
            try:
                a7 = comments.xpath('/html/body/div/div/div[3]/div/div/div[10]/table/tr[1]/td[3]/span/text()')[0]
            except Exception:
                a7 = comments.xpath('/html/body/div/div/div[3]/div/div/div[3]/table/tr[1]/td[3]/span/text()')[0]
        except Exception:
            a7 = comments.xpath('/html/body/div/div/div[3]/div/div/div[10]/table/tr/td/span/text()')[0]
        result_a = pd.DataFrame({'mcc_code': [mcc], 'acquirer_org': [a1],
                                 'mch_area': [a2], 'mch_type2': [a3],
                                 'mch_type1': [a4], 'mch_fee_rate_old': [a5],
                                 'mch_fee_rate_new': [a6], 'coll_date': [date],
                                 'cmbchina_sugg': [a7]})
        # DataFrame.append was removed in pandas 2.0 — use pd.concat.
        result_A = pd.concat([result_A, result_a], ignore_index=True)
        print('爬取%s成功!' % mcc)
        # Random jitter between requests to avoid hammering the site.
        time.sleep(random.randint(0, 100) * 0.03)
    except Exception:
        # Any fetch/parse failure: record the code and back off briefly.
        fail.append(mcc)
        rest = random.randint(1, 5)
        print('休息%s秒钟' % rest)
        time.sleep(rest)

# Fix the column order once after the loop (the original redid this on every
# iteration, which was redundant).
result_A = result_A[['mcc_code', 'acquirer_org', 'mch_area', 'mch_type2',
                     'mch_type1', 'mch_fee_rate_old', 'mch_fee_rate_new',
                     'coll_date', 'cmbchina_sugg']]
# --- Persist results and inspect suspicious cells ---------------------------
# `pd.io.sql.to_sql` is a private alias; the public DataFrame method is the
# supported API.
result_A.to_sql('m1_mcc_info', connect, schema='ccia', index=False,
                if_exists='append')

# Error handling: locate cells where the fee-rate slicing failed and the raw
# "原0.38" marker text was stored instead of a bare number.
# (The original had this heading as a bare prose line, which is a SyntaxError.)
for row_index in result_A.index:
    row_values = result_A.loc[row_index].values  # hoisted out of inner loop
    for col_pos, cell in enumerate(row_values):
        if cell == '原0.38':
            print(row_index, col_pos)
            print(cell)

# More crawler examples (original trailing prose line, kept as a comment):
# https://blog.csdn.net/weixin_39777626/article/details/81564819