Python抓取几大票房统计系统数据之专资办票房数据库

最新推荐文章于 2021-05-08 17:22:28 发布

千码君2016

最新推荐文章于 2021-05-08 17:22:28 发布

阅读量1.5k

点赞数

分类专栏： python 文章标签： python selenium

本文链接：https://blog.csdn.net/shunzi2016/article/details/78112142

版权

python 专栏收录该内容

118 篇文章 1 订阅

订阅专栏

# coding=utf-8
import re
import urllib
import urllib2
from selenium import webdriver
import time
import datetime
import calendar
import sys
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from BeautifulSoup import BeautifulSoup
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
import chardet
class Piaofangshujk_05:
    """Scrape a 10-column box-office table out of an HTML page with regexes.

    Call downloadHtml(url), or feed HTML directly to parseHtml_th() and
    parseHtml_td(), then read the results from these instance attributes:

    * ths -- header cell texts of every header row that has exactly 10 cells
    * tds -- raw HTML of every ``class="even"``/``class="odd"`` row that has
      exactly 10 plain ``<td>`` cells
    * dd  -- one list of 10 cell texts per row in ``tds``; when the first
      cell is a link carrying a ``title`` attribute, the title replaces it

    Results accumulate when the parse methods are called repeatedly on the
    same instance.
    """

    # Not used by any method in this class; kept so existing attribute
    # access keeps working.
    site_url = ''
    data_table = ''
    data_th = []
    data_td = []
    regMsg = ''
    imgtype = ''
    imgpath = ''
    fileName = ''
    img_foren_path = ''
    img_back_path = ''

    # Class-level fallbacks kept for backward compatibility only.
    # __init__ gives every instance its own lists, so these are never
    # mutated through an instance.
    ths = []
    tds = []
    dd = []

    _COLUMNS = 10
    _HEADER_ROW_RE = re.compile(r'<tr>.*?</tr>', re.S)
    # The alternation is grouped so that both "even" and "odd" rows are
    # matched as complete <tr ...>...</tr> elements.
    _BODY_ROW_RE = re.compile(
        r'<tr\b[^>]*\bclass="(?:even|odd)"[^>]*>.*?</tr>', re.S)
    _TH_RE = re.compile(r'<th>(.*?)</th>')
    _TD_RE = re.compile(r'<td>(.*?)</td>')
    _LINK_TITLE_RE = re.compile(r'<a.*href=.*title="(.*?)">')

    def __init__(self):
        # Per-instance result lists: two scrapers must not share rows.
        self.data_th = []
        self.data_td = []
        self.ths = []
        self.tds = []
        self.dd = []

    def __unicode__(self):
        """Return the parsed rows (``dd``) rendered as a unicode string."""
        # repr() keeps this safe when cells hold non-ASCII byte strings.
        return u'%r' % (self.dd,)

    def downloadHtml(self, url):
        """Fetch ``url`` and parse its header and body rows.

        The body is passed through unchanged when chardet reports UTF-8;
        otherwise it is decoded as GB2312 (undecodable bytes dropped) and
        re-encoded as UTF-8 before parsing.

        Raises whatever ``urllib2.urlopen`` raises on network failure.
        """
        connection = urllib2.urlopen(url)
        try:
            raw = connection.read()
        finally:
            # Always release the socket, even if read() fails.
            connection.close()

        # Work around mojibake: normalise everything to UTF-8 bytes.
        detected = chardet.detect(raw)['encoding']
        if (detected or '').lower() == 'utf-8':
            response = raw
        else:
            response = raw.decode('gb2312', 'ignore').encode('utf-8')

        self.parseHtml_th(response)
        self.parseHtml_td(response)

    def parseHtml_th(self, html):
        """Append to ``ths`` the cells of each ``<tr>`` row with 10 ``<th>``."""
        for row in self._HEADER_ROW_RE.findall(html):
            headings = self._TH_RE.findall(row)
            if len(headings) == self._COLUMNS:
                self.ths.extend(headings)

    def parseHtml_td(self, html):
        """Collect every even/odd row of ``html`` that has 10 ``<td>`` cells.

        The raw row HTML is appended to ``tds`` and its cell texts to ``dd``.
        Empty cells are kept as ``''`` so columns stay aligned. Only rows
        found in this call are processed, so earlier rows are not duplicated.
        """
        for row in self._BODY_ROW_RE.findall(html):
            cells = self._TD_RE.findall(row)
            if len(cells) != self._COLUMNS:
                continue
            self.tds.append(row)
            # A linked first cell is reduced to the link's title text.
            link = self._LINK_TITLE_RE.match(cells[0])
            if link:
                cells[0] = link.group(1)
            self.dd.append(cells)
'''
    def Screenshots(self,url):
        browser = webdriver.Firefox()  # Get local session of firefox
        browser.set_window_size(1200, 900)
        browser.get(url)  # Load page
        browser.execute_script("""
                (function () {
                    var y = 0;
                    var step = 100;
                    window.scroll(0, 0);

                    function f() {
                        if (y < document.body.scrollHeight) {
                            y += step;
                            window.scroll(0, y);
                            setTimeout(f, 100);
                        } else {
                            window.scroll(0, 0);
                            document.title += "scroll-done";
                        }
                    }

                    setTimeout(f, 1000);
                })();
            """)

        for i in xrange(30):
            if "scroll-done" in browser.title:
                break
            time.sleep(10)
        browser.save_screenshot(self.img_back_path)
        browser.close()
    def getdatetime(self):

        tsts = time.asctime(time.localtime(time.time()))
        #2017-05-09_17:27:13
        # print tsts
        stime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        #print stime
        #Tue May 09 17:27:13 2017
        ''''''

        # stime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # fileyy = re.sub(r'\s', '_', stime)
        #
        # print "----" + fileyy
        # ttt = '%f' % time.time()
        #
        # filexx = re.sub(r'\D', '-', ttt)
        # print ttt
        # print filexx
        # print time.asctime(time.localtime(time.time()))
        #
        # ttt = '%f' % time.time()
        # filexx = re.sub(r'\D', '-', ttt)
        # newfile = filexx + ".txt"
        # fo = open(newfile, 'wb')
        return stime
    def getDate(self):
        date = datetime.date


    def writePic(self):
        #write time to picture
        font = ImageFont.truetype('simsun',24)
        # imageFile = "capture.png"
        imgbefore = Image.open(self.img_foren_path)
        draw = ImageDraw.Draw(imgbefore)
        txt = self.getdatetime
        draw.text((160, 200), unicode(txt, encoding='utf-8'), (255, 210, 0), font=font)
        draw = ImageDraw.Draw(imgbefore)
        imgbefore.save(self.img_foren_path)
        return imgbefore
        # img.show()

    def mergePic(self,filename):
        ''' '''
        Im = Image.open("img_back_path.png")

        # print Im.mode,Im.size,Im.format,Im.palette,Im.info

        # 图片的背景图片
        # newIm = Image.new("RGBA", (460, 480), (255, 0, 0))

        Im2 = Image.open(self.img_foren_path).convert(Im.mode)
        Im2 = Im2.resize(Im.size)
        img = Image.blend(Im,Im2,0.2)
        # img.show()
        img.save(filename+'-'+self.getdatetime+'jpg','JPEG')
        return img
        '''

if __name__=='__main__':
    # Scrape the 58921.com front page, then dump the header count, the
    # header cells, and every parsed row, each row preceded by a separator.
    scraper = Piaofangshujk_05()
    scraper.downloadHtml('http://58921.com/')
    separator = '---------------------------------'
    print(len(scraper.ths))
    for heading in scraper.ths:
        print(heading)
    print(separator)
    for row in scraper.dd:
        print(separator)
        for cell in row:
            print(cell)




'''
html = urllib2.urlopen("http://www.piaofang168.com/").read()
h = re.findall(r'<img[^>](.*?)width="165" height="166" border="0"(.*?)alt=(\".*\").*>', html, re.M)
dataT=[]
for ty in h:
    objMatch = re.match(r'\"(.*?)\".*', ty[2], re.I)
    if objMatch:
        print "matchObj.group(1)", objMatch.group(1)
        dataT.append(objMatch.group(1))
    else:
        print "no match!"
print str(dataT)
print "时间"+"---"+dataT[0]+"---"+dataT[1]+"---"+dataT[2]+"---"+dataT[3]+"---"+dataT[4]
'''