python提取html_python(初学提取html页面元素,借用老师)

# -*- coding: utf-8 -*-

import urllib2

import os

def mean_audience_score(id):
    """Return the mean of the decimal scores found on a film's Mtime page.

    Downloads ``http://movie.mtime.com/<id>/``, collects every decimal number
    matched by the score pattern and averages them.

    Args:
        id: Film identifier used as the URL path segment (string or number).

    Returns:
        The arithmetic mean of the matched scores as a float, or ``0.0`` when
        the page contains no match.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
    """
    # Imported locally because the file's import block does not provide `re`.
    import re

    # %-formatting accepts numeric ids as well as strings.
    sc_url = "http://movie.mtime.com/%s/" % (id,)
    sc_req = urllib2.Request(sc_url, headers={'User-Agent': "Magic Browser"})
    sc_page = urllib2.urlopen(sc_req)
    try:
        sc_strw = sc_page.read()
    finally:
        # Release the connection even if reading fails.
        sc_page.close()

    # NOTE(review): the published pattern began with a bare '+', which `re`
    # rejects ("nothing to repeat"); it presumably lost an HTML-tag prefix when
    # the page was extracted. Without that prefix this matches every decimal
    # number on the page -- confirm the intended anchor before relying on it.
    # A single '\.' is used so that runs such as "8..5" are not captured and
    # later rejected by float().
    sc_str = re.findall(r'\d+\.\d+', sc_strw)
    if not sc_str:
        return 0.0

    total = 0.0
    for tt in sc_str:
        # Keep only one digit after the decimal point of each match.
        scsc = re.findall(r'\d+\.\d', tt)
        total += float(scsc[0])
    return total / len(sc_str)

# Fetch the Mtime theater page for Wuhu, locate the text that follows
# 'hotplaySvList = [' up to the next ';' (presumably a JavaScript array of film
# records -- confirm against the live page) and print each film's id and title.
# The single-argument print(...) form produces the same output under the
# Python 2 print statement this file relies on.
url = 'http://theater.mtime.com/China_Anhui_Province_Wuhu/'
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
webpage = urllib2.urlopen(req)
try:
    # Use the response body unmodified: it must be non-empty for the
    # 'hotplaySvList' search below to have anything to find.
    strw = webpage.read()
finally:
    # Release the connection even if reading fails.
    webpage.close()
print(strw)

tg_start = strw.find('hotplaySvList = [')
print(tg_start)
if tg_start == -1:
    print('not find start tag')
    # SystemExit lets the interpreter flush stdout before terminating, so the
    # message above is not lost when output is piped; exit status stays 0.
    raise SystemExit(0)

# Slice through the end of the page so the final character is not dropped.
tmp = strw[tg_start:]
print(tmp)

tg_end = tmp.find(';')
print(tg_end)
if tg_end == -1:
    print('not find end tag')
    raise SystemExit(0)

# Keep only the text between the start tag and the terminating ';'.
tmp = tmp[len('hotplaySvList = ['):tg_end]
print(tmp)

tar_ls = tmp.split("},{")
dict_film = {}
for t0 in tar_ls:
    ls_t = t0.split(',')
    # The id is whatever follows the last ':' in the first comma-separated field.
    film_id = ls_t[0].split(':')[-1].strip()
    # The title is the second-to-last piece of the last field when split on '"'.
    quoted = ls_t[-1].split('"')
    if len(quoted) < 2:
        # No quoted text in this entry (e.g. an empty list): nothing to record.
        continue
    film = quoted[-2].strip()
    dict_film[film_id] = film

for t in dict_film:
    print("id:" + t + " film:" + dict_film[t])

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值