Python爬取p2pblack

windows下Python操作mysql需要安装的驱动:

mysql驱动(64位)


实例代码:

# -*- coding: utf-8 -*-   
import urllib2
import urllib
import re
import json
import sys
import thread
import time
import MySQLdb.cursors
reload(sys)
sys.setdefaultencoding('utf-8')

class p2pBlack:

    def __init__(self):
        # 初始页
        self.pageNow = 1
        # 默认每页显示20条记录
        self.showCount = 20
        # 网站根目录,用于拼接获取到的url
        self.basePath = "http://www.p2pblack.com"
        # 列表页url
        self.listUrl = "http://www.p2pblack.com/cheat/frontDeadBeatList.html"
        # 设置时间格式
        self.timeFormat='%Y-%m-%d %X'
        #是否继续
        self.isgo=True
    
    # 获取页面
    def getHtml(self, pageNow, count):
        url = self.listUrl + '?currentPage=' + str(pageNow) + '&showCount=' + str(count)
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'     
        headers = { 'User-Agent' : user_agent } 
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        page = response.read()
        unicodePage = page.decode("utf-8")
        return unicodePage
    
    # 获取总条数,加入url中,方便直接查询全部
    def getCount(self):
        unicodePage = self.getHtml(self.pageNow, self.showCount);
        count = re.findall('<font color=red>(.*?)</font>', unicodePage, re.S)
        self.showCount = count[0]
        return self.showCount
    # 获取所有未记录的总数
    def getNewCount(self):
        enable=True
        count=0
        allred=0
        page=self.pageNow
        print u'[info]['+self.nowTime()+']计算未收录信息总条数,第一次加载稍微有点慢:'
        open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']计算未收录信息总条数,第一次加载稍微有点慢:\n')
        while enable:
            unicodePage=self.getHtml(page, self.showCount)
#             unicodePage=self.getHtml(28,20)
            infoList=re.findall('<b>(.*?)</b>',unicodePage,re.S)
            for i in range(len(infoList)):
                reCo=self.getInfoById(infoList[i])
                if reCo==0:
                    count =i+1
                else:
                    enable=False
            allred=(page-1)*int(self.showCount)+count
            print u'[info]['+self.nowTime()+']第%d页%d条数据未获取...' %(page,count)
            open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']第%d页%d条数据未获取...' %(page,count)+'\n')
            if count==self.showCount:
                page +=1
            else:
                enable=False
        print u'[info]['+self.nowTime()+']查询记录数完毕...'
        open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']查询记录数完毕...\n')
        print u'[info]['+self.nowTime()+']共%d条数据未收录' %(allred)
        open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']共%d条数据未收录' %(allred)+'\n')
        return allred
    
    
    # 获取所有记录的详情url
    def getAllUrl(self):
        count = self.getNewCount()
#         count=self.showCount
        if count>0:
            print u'[info]['+self.nowTime()+']开始查询所有未收录记录的详情url'
            open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']开始查询所有未收录记录的详情url\n')
            
            unicodePage = self.getHtml(self.pageNow, count)
            urlList = re.findall('<a href=\"(/cheat/showDetail.html.*?)\".*?>', unicodePage, re.S | re.I)
            
            print u'[info]['+self.nowTime()+']共获取到%d条url' %(len(urlList))
            open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']共获取到%d条url' %(len(urlList))+'\n')
            return urlList
        else:
            print u'[info]['+self.nowTime()+']没有新记录产生,结束本次。'
            open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']没有新记录产生,结束本次。\n')
            return False
    # 获取详情页
    def getDetail(self, url):
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        page = response.read()
        unicodePage = page.decode("utf-8")
        return unicodePage
    
    # 遍历所有记录的详情,并进入详情页抓取数据
    def forRecodr(self):
        recList = self.getAllUrl()
        if recList:
            for m in range(len(recList)):
                print u'[info]['+self.nowTime()+']整理第%d条记录的信息' %(m+1)
                open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']整理第%d条记录的信息' %(m+1)+'\n')
                detailUrl = self.basePath + recList[m]
                detail = self.getDetail(detailUrl)
                # 取出编号等信息id相关信息
                detidP = re.findall('<p.*?class="ft_publick_bh">(.*?)</p>', detail, re.S)
                detid = re.findall('.*?<span>(.*?)</span>.*?<span.*?class="ft_publick_jicu">(.*?)</span>.*?', detidP[0], re.S)
                # 取出欠债人基本信息
                detinfodiv = re.findall('<div.*?class="ft_publick_pzxxright ft_publick_pzxxrightss">(.*?)</div>', detail, re.S)
                detinfo = re.findall('<p>.*?<span>(.*?)</span>.*?<span>(.*?)</span>.*?</p>', detinfodiv[0], re.S)
                valueList = []
                cardid = ''
                cardinfo = []
                infoid = detid[0][1]
                # 打印信息
                for i in range(len(detid)):
                    valueList.append(str(detid[i][1]).replace('\t', '').replace('\r', '').replace('\n', ''))
                
                for i in range(len(detinfo)):
                    #姓名判断
                    if i==0:
                        if detinfo[i][1]=='':
                            print u'[info]['+self.nowTime()+']第%d条记录的姓名为空,判定为垃圾数据,放弃存储。' %(m+1)
                            open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']第%d条记录的姓名为空,判定为垃圾数据,放弃存储。' %(m+1)+'\n')
                            break
                    #欠款判断
                    if i==3:
                        if detinfo[i][1]=='元':
                            print u'[info]['+self.nowTime()+']第%d条记录的欠款为空,判定为垃圾数据,放弃存储。' %(m+1)
                            open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']第%d条记录的欠款为空,判定为垃圾数据,放弃存储。' %(m+1)+'\n')
                            break
                    # 最后一行记录为照片
                    if i == len(detinfo) - 1:
                        if detinfo[i][1]:
                            imgurl = re.findall('<img.*?src="(.*?)".*?style="max-width: 100px;" />', detinfo[i][1], re.S)
    #                         imgurls=''
                            for j in range(len(imgurl)):
    #                             imgurls=imgurls+imgurl[j]+'\n'
                                if imgurl[j] != 'null':
                                    # 获取照片名称与后缀
                                    imgname = imgurl[j].split('/')[-1]
    #                                 print imgname
    #                                 print infoid
                                    # 新建一个线程保存图片
    #                                 thread.start_new_thread(self.saveImages,(imgname,infoid,imgurl[j],))
                                    self.saveImages(imgname, infoid, imgurl[j])
                                    # 保存照片到本地
    #                                 urllib.urlretrieve(imgurl[j],'F://p2pimages/'+imgname)
    #                                 print detinfo[i][0]+'\n'+imgurls
                    else:
                        # 处理证件
                        if i == 1:
                            cardIdList = str(detinfo[i][1]).split('     ')
                            if str(cardIdList[0]) == '身份证':
                                cardid = cardIdList[1]
                                cardinfo = self.getCaridInfo(cardid)
                            else:
                                cardid = detinfo[i][1]
                            valueList.append(str(cardid).replace('     ', ':').replace('\t', '').replace('\r', '').replace('\n', '').strip())
                        else:
                            valueList.append(str(detinfo[i][1]).replace('\t', '').replace('\r', '').replace('\n', '').strip())
                            
                if len(cardinfo)==0:
                    cardinfo=['','',''] 
                valueList.extend(cardinfo)
                valueList.append(self.nowTime())
                rec = self.saveInfo(valueList)
                if rec != 1:
                    print u'[info]['+self.nowTime()+']插入第%d条信息失败!' %(m+1)
                    open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']插入第%d条信息失败!' %(m+1)+'\n')
                else:
                    print u'[info]['+self.nowTime()+']插入第%d条信息成功!' %(m+1)
                    open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']插入第%d条信息成功!' %(m+1)+'\n')
        print u'\n查询完毕!'
        open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']查询完毕!\n')
        print u'================================================\n'
        open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']================================================\n')
        print u'回车键再次加载,输入esc结束:\n'
        strInput=str(raw_input(''))
        if strInput=='esc':
            self.isgo=False
        
        
    #获取链接
    def conMysql(self):
        return  MySQLdb.Connect(host='127.0.0.1', user='root', passwd='root', db='p2pblack', charset='utf8', cursorclass=MySQLdb.cursors.DictCursor)
    
    # 插入数据                
    def saveInfo(self, values):
        print u'[info]['+self.nowTime()+']数据存储中...'
        open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']数据存储中...\n')
        db = self.conMysql()
        cur = db.cursor()
        rec = 0
        try:
            rec = cur.execute("insert into heelsinfo values(null,'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % tuple(values))
            db.commit()
        except Exception, e:
            print e
            open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][插入基本信息到数据库时:]'+str(e)+'\n')
            db.rollback()
        db.close()
        return rec
    #保存图片
    def saveImages(self, imgname, infoid, imgurl):
        print u'[info]['+self.nowTime()+']照片:%s存储中...' %(imgname)
        open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']照片:%s存储中...' %(imgname)+'\n')
        # 保存照片到本地
        imagePath = 'F://p2pimages/' + imgname
        try:
            urllib.urlretrieve(imgurl, imagePath)
        except Exception, e:
            print e
            open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][存储照片到本地时:]'+str(e)+'\n')
        db = self.conMysql()
        cur = db.cursor()
        try:
            cur.execute("insert into images values(null,'%s','%s','%s')" % (infoid, imagePath,self.nowTime()))
            db.commit()
        except Exception, e:
            print e
            open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][插入图片路径到数据库时:]'+str(e)+'\n')
            db.rollback()
        db.close()
    #根据infoid查询记录是否存在
    def getInfoById(self,infoid):
        db=self.conMysql()
        cur = db.cursor()
        info=cur.execute("select * from heelsinfo where infoid='%s'" % (infoid))
        return info
    #身份证信息解析
    def getCaridInfo(self, cardid):
        strs = ''
        try:
            req = urllib2.Request('http://apis.juhe.cn/idcard/index?key=596f83b59a28c25f19d071f19490ba84&cardno=' + cardid)
            resp = urllib2.urlopen(req)
            content = resp.read()
            strs = json.loads(content)
            infolist = []
            if(strs['resultcode'] == '200'):
                infolist.append(str(strs['result']['birthday']).strip())
                infolist.append(str(strs['result']['sex']).strip())
                infolist.append(str(strs['result']['area']).strip())
                return infolist
            else:
                print u'身份证号码:', str(cardid)
                print u'返回状态码:', strs['resultcode']
                print u'错误状态码:', strs['error_code']
                print u'错误消息:', strs['reason']
                open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']身份证号码:'+str(cardid)+'\n')
                open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']返回状态码:'+str(strs['resultcode'])+'\n')
                open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']错误状态码:'+str(strs['error_code'])+'\n')
                open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']错误消息:'+str(strs['reason'])+'\n')
                return infolist
        except Exception, e:
            print e
            open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][拆分身份证信息时:]'+str(e)+'\n')
    #获取当前系统时间
    def nowTime(self):
        nowtime=time.strftime(self.timeFormat,time.localtime(time.time()))
        return str(nowtime)
    
    #启动程序
    def start(self):
        print u'开始启动程序......\n'
        open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']开始启动程序......\n')
        while self.isgo:
            self.forRecodr()
    
print u'请按下回车开始执行:'      
raw_input(' ')
p2p = p2pBlack()
print u'开始查询数据,请稍后......\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']开始查询数据,请稍后......\n')
p2p.start()
# infoList=p2p.getCaridInfo('150921199402205418')
# if(infoList):
#     print infoList
print u'运行完毕,结束程序!\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']运行完毕,结束程序!\n')
print u'************************************************\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']************************************************\n')
raw_input('')



写的很乱,有空记得需要整理。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值