windows下Python操作mysql需要安装的驱动:
实例代码:
# -*- coding: utf-8 -*-
import urllib2
import urllib
import re
import json
import sys
import thread
import time
import MySQLdb.cursors
reload(sys)
sys.setdefaultencoding('utf-8')
class p2pBlack:
def __init__(self):
# 初始页
self.pageNow = 1
# 默认每页显示20条记录
self.showCount = 20
# 网站根目录,用于拼接获取到的url
self.basePath = "http://www.p2pblack.com"
# 列表页url
self.listUrl = "http://www.p2pblack.com/cheat/frontDeadBeatList.html"
# 设置时间格式
self.timeFormat='%Y-%m-%d %X'
#是否继续
self.isgo=True
# 获取页面
def getHtml(self, pageNow, count):
url = self.listUrl + '?currentPage=' + str(pageNow) + '&showCount=' + str(count)
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
headers = { 'User-Agent' : user_agent }
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)
page = response.read()
unicodePage = page.decode("utf-8")
return unicodePage
# 获取总条数,加入url中,方便直接查询全部
def getCount(self):
unicodePage = self.getHtml(self.pageNow, self.showCount);
count = re.findall('<font color=red>(.*?)</font>', unicodePage, re.S)
self.showCount = count[0]
return self.showCount
# 获取所有未记录的总数
def getNewCount(self):
enable=True
count=0
allred=0
page=self.pageNow
print u'[info]['+self.nowTime()+']计算未收录信息总条数,第一次加载稍微有点慢:'
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']计算未收录信息总条数,第一次加载稍微有点慢:\n')
while enable:
unicodePage=self.getHtml(page, self.showCount)
# unicodePage=self.getHtml(28,20)
infoList=re.findall('<b>(.*?)</b>',unicodePage,re.S)
for i in range(len(infoList)):
reCo=self.getInfoById(infoList[i])
if reCo==0:
count =i+1
else:
enable=False
allred=(page-1)*int(self.showCount)+count
print u'[info]['+self.nowTime()+']第%d页%d条数据未获取...' %(page,count)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']第%d页%d条数据未获取...' %(page,count)+'\n')
if count==self.showCount:
page +=1
else:
enable=False
print u'[info]['+self.nowTime()+']查询记录数完毕...'
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']查询记录数完毕...\n')
print u'[info]['+self.nowTime()+']共%d条数据未收录' %(allred)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']共%d条数据未收录' %(allred)+'\n')
return allred
# 获取所有记录的详情url
def getAllUrl(self):
count = self.getNewCount()
# count=self.showCount
if count>0:
print u'[info]['+self.nowTime()+']开始查询所有未收录记录的详情url'
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']开始查询所有未收录记录的详情url\n')
unicodePage = self.getHtml(self.pageNow, count)
urlList = re.findall('<a href=\"(/cheat/showDetail.html.*?)\".*?>', unicodePage, re.S | re.I)
print u'[info]['+self.nowTime()+']共获取到%d条url' %(len(urlList))
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']共获取到%d条url' %(len(urlList))+'\n')
return urlList
else:
print u'[info]['+self.nowTime()+']没有新记录产生,结束本次。'
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']没有新记录产生,结束本次。\n')
return False
# 获取详情页
def getDetail(self, url):
req = urllib2.Request(url)
response = urllib2.urlopen(req)
page = response.read()
unicodePage = page.decode("utf-8")
return unicodePage
# 遍历所有记录的详情,并进入详情页抓取数据
def forRecodr(self):
recList = self.getAllUrl()
if recList:
for m in range(len(recList)):
print u'[info]['+self.nowTime()+']整理第%d条记录的信息' %(m+1)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']整理第%d条记录的信息' %(m+1)+'\n')
detailUrl = self.basePath + recList[m]
detail = self.getDetail(detailUrl)
# 取出编号等信息id相关信息
detidP = re.findall('<p.*?class="ft_publick_bh">(.*?)</p>', detail, re.S)
detid = re.findall('.*?<span>(.*?)</span>.*?<span.*?class="ft_publick_jicu">(.*?)</span>.*?', detidP[0], re.S)
# 取出欠债人基本信息
detinfodiv = re.findall('<div.*?class="ft_publick_pzxxright ft_publick_pzxxrightss">(.*?)</div>', detail, re.S)
detinfo = re.findall('<p>.*?<span>(.*?)</span>.*?<span>(.*?)</span>.*?</p>', detinfodiv[0], re.S)
valueList = []
cardid = ''
cardinfo = []
infoid = detid[0][1]
# 打印信息
for i in range(len(detid)):
valueList.append(str(detid[i][1]).replace('\t', '').replace('\r', '').replace('\n', ''))
for i in range(len(detinfo)):
#姓名判断
if i==0:
if detinfo[i][1]=='':
print u'[info]['+self.nowTime()+']第%d条记录的姓名为空,判定为垃圾数据,放弃存储。' %(m+1)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']第%d条记录的姓名为空,判定为垃圾数据,放弃存储。' %(m+1)+'\n')
break
#欠款判断
if i==3:
if detinfo[i][1]=='元':
print u'[info]['+self.nowTime()+']第%d条记录的欠款为空,判定为垃圾数据,放弃存储。' %(m+1)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']第%d条记录的欠款为空,判定为垃圾数据,放弃存储。' %(m+1)+'\n')
break
# 最后一行记录为照片
if i == len(detinfo) - 1:
if detinfo[i][1]:
imgurl = re.findall('<img.*?src="(.*?)".*?style="max-width: 100px;" />', detinfo[i][1], re.S)
# imgurls=''
for j in range(len(imgurl)):
# imgurls=imgurls+imgurl[j]+'\n'
if imgurl[j] != 'null':
# 获取照片名称与后缀
imgname = imgurl[j].split('/')[-1]
# print imgname
# print infoid
# 新建一个线程保存图片
# thread.start_new_thread(self.saveImages,(imgname,infoid,imgurl[j],))
self.saveImages(imgname, infoid, imgurl[j])
# 保存照片到本地
# urllib.urlretrieve(imgurl[j],'F://p2pimages/'+imgname)
# print detinfo[i][0]+'\n'+imgurls
else:
# 处理证件
if i == 1:
cardIdList = str(detinfo[i][1]).split(' ')
if str(cardIdList[0]) == '身份证':
cardid = cardIdList[1]
cardinfo = self.getCaridInfo(cardid)
else:
cardid = detinfo[i][1]
valueList.append(str(cardid).replace(' ', ':').replace('\t', '').replace('\r', '').replace('\n', '').strip())
else:
valueList.append(str(detinfo[i][1]).replace('\t', '').replace('\r', '').replace('\n', '').strip())
if len(cardinfo)==0:
cardinfo=['','','']
valueList.extend(cardinfo)
valueList.append(self.nowTime())
rec = self.saveInfo(valueList)
if rec != 1:
print u'[info]['+self.nowTime()+']插入第%d条信息失败!' %(m+1)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']插入第%d条信息失败!' %(m+1)+'\n')
else:
print u'[info]['+self.nowTime()+']插入第%d条信息成功!' %(m+1)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']插入第%d条信息成功!' %(m+1)+'\n')
print u'\n查询完毕!'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']查询完毕!\n')
print u'================================================\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']================================================\n')
print u'回车键再次加载,输入esc结束:\n'
strInput=str(raw_input(''))
if strInput=='esc':
self.isgo=False
#获取链接
def conMysql(self):
return MySQLdb.Connect(host='127.0.0.1', user='root', passwd='root', db='p2pblack', charset='utf8', cursorclass=MySQLdb.cursors.DictCursor)
# 插入数据
def saveInfo(self, values):
print u'[info]['+self.nowTime()+']数据存储中...'
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']数据存储中...\n')
db = self.conMysql()
cur = db.cursor()
rec = 0
try:
rec = cur.execute("insert into heelsinfo values(null,'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % tuple(values))
db.commit()
except Exception, e:
print e
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][插入基本信息到数据库时:]'+str(e)+'\n')
db.rollback()
db.close()
return rec
#保存图片
def saveImages(self, imgname, infoid, imgurl):
print u'[info]['+self.nowTime()+']照片:%s存储中...' %(imgname)
open('F://p2plogs/log.txt','a').write('[info]['+self.nowTime()+']照片:%s存储中...' %(imgname)+'\n')
# 保存照片到本地
imagePath = 'F://p2pimages/' + imgname
try:
urllib.urlretrieve(imgurl, imagePath)
except Exception, e:
print e
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][存储照片到本地时:]'+str(e)+'\n')
db = self.conMysql()
cur = db.cursor()
try:
cur.execute("insert into images values(null,'%s','%s','%s')" % (infoid, imagePath,self.nowTime()))
db.commit()
except Exception, e:
print e
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][插入图片路径到数据库时:]'+str(e)+'\n')
db.rollback()
db.close()
#根据infoid查询记录是否存在
def getInfoById(self,infoid):
db=self.conMysql()
cur = db.cursor()
info=cur.execute("select * from heelsinfo where infoid='%s'" % (infoid))
return info
#身份证信息解析
def getCaridInfo(self, cardid):
strs = ''
try:
req = urllib2.Request('http://apis.juhe.cn/idcard/index?key=596f83b59a28c25f19d071f19490ba84&cardno=' + cardid)
resp = urllib2.urlopen(req)
content = resp.read()
strs = json.loads(content)
infolist = []
if(strs['resultcode'] == '200'):
infolist.append(str(strs['result']['birthday']).strip())
infolist.append(str(strs['result']['sex']).strip())
infolist.append(str(strs['result']['area']).strip())
return infolist
else:
print u'身份证号码:', str(cardid)
print u'返回状态码:', strs['resultcode']
print u'错误状态码:', strs['error_code']
print u'错误消息:', strs['reason']
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']身份证号码:'+str(cardid)+'\n')
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']返回状态码:'+str(strs['resultcode'])+'\n')
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']错误状态码:'+str(strs['error_code'])+'\n')
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+']错误消息:'+str(strs['reason'])+'\n')
return infolist
except Exception, e:
print e
open('F://p2plogs/log.txt','a').write('[error]['+self.nowTime()+'][拆分身份证信息时:]'+str(e)+'\n')
#获取当前系统时间
def nowTime(self):
nowtime=time.strftime(self.timeFormat,time.localtime(time.time()))
return str(nowtime)
#启动程序
def start(self):
print u'开始启动程序......\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']开始启动程序......\n')
while self.isgo:
self.forRecodr()
print u'请按下回车开始执行:'
raw_input(' ')
p2p = p2pBlack()
print u'开始查询数据,请稍后......\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']开始查询数据,请稍后......\n')
p2p.start()
# infoList=p2p.getCaridInfo('150921199402205418')
# if(infoList):
# print infoList
print u'运行完毕,结束程序!\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']运行完毕,结束程序!\n')
print u'************************************************\n'
open('F://p2plogs/log.txt','a').write('[info]['+p2p.nowTime()+']************************************************\n')
raw_input('')