# coding=utf-8
import re
import urllib2
import chardet
class Yiendianyingzhiku_01:
    """Scrape ``<td style='width...'>`` table cells from a page and group them into rows.

    Attributes:
        dataT: flat list of cell contents, in document order, accumulated
            over every ``parse`` call made on this instance.
        dataY: list of rows, each a list of ``ROW_WIDTH`` consecutive
            entries of ``dataT``. Rebuilt by every ``merge`` call.
    """

    # Number of consecutive cells that form one row of ``dataY``.
    ROW_WIDTH = 6

    # One table cell that opens and closes on the same line.
    #   [^>]*      keeps the opening tag from running past its own '>'.
    #   (?!<img)   skips cells whose content starts with an <img> tag; the
    #              previous character class ``[^<img]`` instead rejected any
    #              cell starting with '<', 'i', 'm' or 'g'.
    #   (.+?)      non-greedy, so several cells on one line stay separate;
    #              at least one character is required, so empty cells are
    #              skipped.
    _CELL_RE = re.compile(r"<td style='width[^>]*'>(?!<img)(.+?)</td>")

    def __init__(self):
        # Per-instance lists: class-level lists would be shared by every
        # instance of the scraper.
        self.dataT = []
        self.dataY = []

    def downloadHtml(self, url):
        """Download *url*, convert the body to UTF-8 bytes and parse it.

        Args:
            url: address passed unchanged to ``urllib2.urlopen``.

        The result is not returned; it is stored in ``dataT`` / ``dataY``
        by ``parse``. Errors raised by ``urllib2.urlopen`` or ``read``
        propagate to the caller.
        """
        connection = urllib2.urlopen(url)
        try:
            raw = connection.read()
        finally:
            # Release the socket even when the read fails.
            connection.close()

        # Work around mojibake: find out what encoding the page uses.
        detected = chardet.detect(raw)['encoding']
        if detected and detected.lower() == 'utf-8':
            page = raw
        else:
            # Anything that is not detected as UTF-8 (including "unknown")
            # is treated as GB2312; undecodable bytes are dropped.
            page = raw.decode('gb2312', 'ignore').encode('utf-8')
        self.parse(page)

    def parse(self, response):
        """Extract the matching table cells from *response* and regroup rows.

        Args:
            response: HTML text to scan.

        The content of every cell matched by ``_CELL_RE`` is appended to
        ``dataT``; ``merge`` is then called to refresh ``dataY``.
        """
        self.dataT.extend(self._CELL_RE.findall(response))
        self.merge()

    def merge(self):
        """Rebuild ``dataY`` as consecutive ``ROW_WIDTH``-cell slices of ``dataT``.

        Trailing cells that do not fill a complete row are ignored rather
        than raising ``IndexError``. ``dataY`` is replaced in place, so
        calling this repeatedly never duplicates rows and existing
        references to the list stay valid.
        """
        width = self.ROW_WIDTH
        usable = len(self.dataT) - len(self.dataT) % width
        self.dataY[:] = [
            self.dataT[start:start + width]
            for start in range(0, usable, width)
        ]
if __name__ == '__main__':
    # Script entry point: scrape the page below, then print the first and
    # fourth cell of every row that was collected.
    scraper = Yiendianyingzhiku_01()
    scraper.downloadHtml('http://www.cbooo.cn/')
    for row in scraper.dataY:
        print('%s %s' % (row[0], row[3]))
Python抓取几大票房统计系统数据之艺恩电影数据
最新推荐文章于 2024-01-24 16:54:39 发布