# coding=utf-8
import re
import urllib
import urllib2
from selenium import webdriver
import time
import datetime
import calendar
import sys
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from BeautifulSoup import BeautifulSoup
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
import chardet
class Piaofangshujk_05:
    """Scrape ten-column box-office tables out of an HTML page.

    ``downloadHtml`` fetches a page and feeds it to ``parseHtml_th`` and
    ``parseHtml_td``.  Results accumulate on the instance:

    * ``ths`` -- header cell texts (flat list, ten per accepted header row)
    * ``tds`` -- raw HTML of every accepted data row
    * ``dd``  -- one list of ten cell values per accepted data row

    Only rows with exactly ten cells are accepted; everything else is skipped.
    """

    # String settings; none of the methods in this class reads them.
    site_url = ''
    data_table = ''
    regMsg = ''
    imgtype = ''
    imgpath = ''
    fileName = ''
    img_foren_path = ''
    img_back_path = ''

    # Class-level defaults kept only so ``Piaofangshujk_05.ths`` and friends
    # still resolve.  ``__init__`` gives every instance its own lists, because
    # appending through ``self`` to these shared lists would leak rows from
    # one scraper instance into all the others.
    data_th = []
    data_td = []
    ths = []
    tds = []
    dd = []

    # Number of cells a row must have to be treated as part of the table.
    _COLUMN_COUNT = 10

    # Non-greedy + DOTALL so each <tr> is matched on its own, whether the
    # markup is minified onto one line or a row is wrapped over several lines.
    _HEADER_ROW_RE = re.compile(r'<tr>.*?</tr>', re.S)
    _HEADER_CELL_RE = re.compile(r'<th>(.*?)</th>', re.S)
    # The alternation is grouped: an unparenthesised ``a|b`` splits the whole
    # pattern in two, which made "even" rows match without any of their cells.
    _DATA_ROW_RE = re.compile(
        r'<tr[^>]*class="(?:even|odd)"[^>]*>.*?</tr>', re.S)
    _DATA_CELL_RE = re.compile(r'<td>(.*?)</td>', re.S)
    # Captures the title attribute of a link that starts a cell.
    _LINK_TITLE_RE = re.compile(r'<a.*href=.*title="(.*?)">')

    def __init__(self):
        """Create a scraper with empty, per-instance result lists."""
        self.data_th = []
        self.data_td = []
        self.ths = []
        self.tds = []
        self.dd = []

    def __unicode__(self):
        """Return the parsed rows rendered as a unicode string."""
        # ``self.dd`` is a list; returning it directly makes ``unicode(obj)``
        # raise TypeError, so format it into a string instead.
        return u'%s' % (self.dd,)

    def downloadHtml(self, url):
        """Fetch ``url`` and parse its header and data rows.

        The body is passed on as UTF-8 bytes: pages detected as UTF-8 are
        used as-is, anything else is decoded as GB2312 (undecodable bytes
        ignored) and re-encoded.

        :param url: address of the page to download.
        :raises urllib2.URLError: if the page cannot be fetched.
        """
        connection = urllib2.urlopen(url)
        try:
            raw = connection.read()
        finally:
            # Release the socket even when the read fails.
            connection.close()

        # chardet may report None (e.g. for an empty body) and does not
        # promise a fixed letter case for the encoding name.
        detected = chardet.detect(raw)['encoding']
        if detected and detected.lower() == 'utf-8':
            page = raw
        else:
            page = raw.decode('gb2312', 'ignore').encode('utf-8')

        self.parseHtml_th(page)
        self.parseHtml_td(page)

    def parseHtml_th(self, html):
        """Append the cells of every ten-column header row to ``self.ths``.

        :param html: page markup to scan.
        """
        for row in self._HEADER_ROW_RE.findall(html):
            cells = self._HEADER_CELL_RE.findall(row)
            if len(cells) == self._COLUMN_COUNT:
                self.ths.extend(cells)

    def parseHtml_td(self, html):
        """Collect every ten-column ``even``/``odd`` data row from ``html``.

        For each accepted row the raw markup is appended to ``self.tds`` and
        the list of its ten cell values to ``self.dd``.  When the first cell
        starts with a link carrying a ``title`` attribute, the cell value is
        replaced by that title.

        Only rows found in this call are processed, so calling the method
        again does not re-append rows from earlier calls.

        :param html: page markup to scan.
        """
        for row in self._DATA_ROW_RE.findall(html):
            # Taking the captured cells directly keeps empty cells in place
            # (columns stay aligned with the headers) and cannot pick up
            # stray markup such as a trailing ``</tr>``.
            cells = self._DATA_CELL_RE.findall(row)
            if len(cells) != self._COLUMN_COUNT:
                continue

            link = self._LINK_TITLE_RE.match(cells[0])
            if link:
                cells[0] = link.group(1)

            self.tds.append(row)
            self.dd.append(cells)
# NOTE: everything from here down to the closing quotes is dead code parked
# inside string literals -- a bare expression statement that is evaluated and
# discarded.  It holds disabled helpers (Screenshots, getdatetime, getDate,
# writePic, mergePic); none of them is defined or callable at runtime.
# The inner `''''''` and `''' '''` lines each close one literal and open the
# next, so the span is really three adjacent literals that Python concatenates.
'''
def Screenshots(self,url):
browser = webdriver.Firefox() # Get local session of firefox
browser.set_window_size(1200, 900)
browser.get(url) # Load page
browser.execute_script("""
(function () {
var y = 0;
var step = 100;
window.scroll(0, 0);
function f() {
if (y < document.body.scrollHeight) {
y += step;
window.scroll(0, y);
setTimeout(f, 100);
} else {
window.scroll(0, 0);
document.title += "scroll-done";
}
}
setTimeout(f, 1000);
})();
""")
for i in xrange(30):
if "scroll-done" in browser.title:
break
time.sleep(10)
browser.save_screenshot(self.img_back_path)
browser.close()
def getdatetime(self):
tsts = time.asctime(time.localtime(time.time()))
#2017-05-09_17:27:13
# print tsts
stime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#print stime
#Tue May 09 17:27:13 2017
''''''
# stime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# fileyy = re.sub(r'\s', '_', stime)
#
# print "----" + fileyy
# ttt = '%f' % time.time()
#
# filexx = re.sub(r'\D', '-', ttt)
# print ttt
# print filexx
# print time.asctime(time.localtime(time.time()))
#
# ttt = '%f' % time.time()
# filexx = re.sub(r'\D', '-', ttt)
# newfile = filexx + ".txt"
# fo = open(newfile, 'wb')
return stime
def getDate(self):
date = datetime.date
def writePic(self):
#write time to picture
font = ImageFont.truetype('simsun',24)
# imageFile = "capture.png"
imgbefore = Image.open(self.img_foren_path)
draw = ImageDraw.Draw(imgbefore)
txt = self.getdatetime
draw.text((160, 200), unicode(txt, encoding='utf-8'), (255, 210, 0), font=font)
draw = ImageDraw.Draw(imgbefore)
imgbefore.save(self.img_foren_path)
return imgbefore
# img.show()
def mergePic(self,filename):
''' '''
Im = Image.open("img_back_path.png")
# print Im.mode,Im.size,Im.format,Im.palette,Im.info
# 图片的背景图片
# newIm = Image.new("RGBA", (460, 480), (255, 0, 0))
Im2 = Image.open(self.img_foren_path).convert(Im.mode)
Im2 = Im2.resize(Im.size)
img = Image.blend(Im,Im2,0.2)
# img.show()
img.save(filename+'-'+self.getdatetime+'jpg','JPEG')
return img
'''
if __name__ == '__main__':
    # Script entry point: scrape the 58921.com front page and dump the parsed
    # table to stdout -- header count, each header, then every data row with
    # a dashed separator in front of it.
    separator = '---------------------------------'

    scraper = Piaofangshujk_05()
    scraper.downloadHtml('http://58921.com/')

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(len(scraper.ths))
    for header in scraper.ths:
        print(header)
    print(separator)

    for record in scraper.dd:
        print(separator)
        for field in record:
            print(field)
# NOTE: disabled experiment kept as a bare string literal; it is never
# executed.  The parked code fetches http://www.piaofang168.com/, uses a regex
# to pull the alt text of 165x166 <img> tags, and prints the first five values.
'''
html = urllib2.urlopen("http://www.piaofang168.com/").read()
h = re.findall(r'<img[^>](.*?)width="165" height="166" border="0"(.*?)alt=(\".*\").*>', html, re.M)
dataT=[]
for ty in h:
objMatch = re.match(r'\"(.*?)\".*', ty[2], re.I)
if objMatch:
print "matchObj.group(1)", objMatch.group(1)
dataT.append(objMatch.group(1))
else:
print "no match!"
print str(dataT)
print "时间"+"---"+dataT[0]+"---"+dataT[1]+"---"+dataT[2]+"---"+dataT[3]+"---"+dataT[4]
'''
Python抓取几大票房统计系统数据之专资办票房数据库
最新推荐文章于 2021-05-08 17:22:28 发布