python爬虫-通过身份证识别地区信息

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 03 10:26:02 2019

@author: johnson.zhong
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time
import requests
from lxml import etree
time1=time.time()
import pandas as pd


# Seconds to wait for the lookup service before giving up on a single ID, so
# one stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 10

# Output column -> XPath of the text to extract from the result page.
# NOTE(review): the mapping of the 1st/2nd/3rd `td.tdc2` cell to
# sex/birthday/address follows the original variable names; confirm against
# the live page layout.
FIELD_XPATHS = (
    ('sex', '//td[@class="tdc2"][1]/text()'),
    ('birthday', '//td[@class="tdc2"][2]/text()'),
    ('address', '//td[@class="tdc2"][3]/text()'),
)


def _lookup_idcard(id_number):
    """Query the lookup page for one ID number and extract its fields.

    Args:
        id_number: the ID-card number as a string, inserted verbatim into the
            query URL.

    Returns:
        A dict with the keys 'sex', 'birthday' and 'address'. Each value is
        the concatenated text found at the corresponding XPath, or an empty
        string when the page contains no match.

    Raises:
        Whatever ``requests.get`` or the lxml parser raise (network errors,
        timeouts, unparsable responses); the caller decides how to proceed.
    """
    # The B1 parameter is an already percent-encoded value, so the URL is
    # assembled by hand to keep it byte-for-byte as the service expects.
    url = ("http://qq.ip138.com/idsearch/index.asp?action=idcard&userid="
           + id_number + "&B1=%B2%E9+%D1%AF")
    html = requests.get(url, timeout=REQUEST_TIMEOUT).content
    selector = etree.HTML(html)
    # Joining the text nodes guarantees exactly one value per field, which
    # keeps every output column the same length as the ID column.
    return dict((name, u''.join(selector.xpath(path)))
                for name, path in FIELD_XPATHS)


# Input: one ID number per line in the first tab-separated column; everything
# is read as text so leading zeros and a trailing 'X' survive.
df = pd.read_csv('e:/shen.txt', sep='\t', header=None, dtype=str, na_filter=False)
print(df)

idcard = []
sex1 = []
birthday1 = []
address1 = []

for id_number in df.iloc[:, 0]:
    print(id_number)
    try:
        info = _lookup_idcard(id_number)
    except Exception as ex:
        # Best-effort scrape: report the failure and keep going. The row is
        # still emitted with blank fields so the columns stay aligned.
        print("%s: %s" % (type(ex).__name__, ex))
        info = {}
    else:
        for name, _ in FIELD_XPATHS:
            print(info[name])

    # Exactly one entry per input row in every list; otherwise the DataFrame
    # constructor below would fail with mismatched column lengths.
    idcard.append(id_number)
    sex1.append(info.get('sex', u''))
    birthday1.append(info.get('birthday', u''))
    address1.append(info.get('address', u''))

data = pd.DataFrame({'idcard': idcard, 'sex': sex1, 'birthday': birthday1, 'address': address1})
print(data)
# The `encoding` argument is not needed for .xlsx output, so it is omitted.
data.to_excel("F:\\person_card.xlsx", header=True, index=False)
time2 = time.time()
print(u'ok,爬虫结束!')
print(u'总共耗时:' + str(time2 - time1) + 's')

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值