从多个猫眼页面的源代码中获取woff文件,从上表分析可以看出,相同数字在不同编码下的坐标个数是不一样的,网上以前的代码已经不能用了,所以很苦恼该怎么办。
从这篇文章(https://blog.csdn.net/qq_43153418/article/details/104043760)中学习到了处理方法,在这里感谢一下。
下面贴出我的代码,具体分析可以到上面的文章中看
import requests
from lxml import etree
from fontTools.ttLib import TTFont
import re
import numpy as np
import os
# 源代码中的数字是 &#xXXXX; 这种形式的实体(每次下载的字体文件编码都不同),这就是字体反爬
class maoyanspider:
    """Scrape Maoyan's board page and decode its font-obfuscated numbers.

    The page renders the "want to see" counters with a per-response web font
    whose glyph code points change on every download.  Each downloaded glyph
    is matched against a locally stored reference font (``basefonts.woff``)
    by comparing outline coordinates, which yields the real digit.

    Attributes:
        headers: HTTP headers (cookie and user agent) sent with page requests.
        base_url: Board URL prefix; the page offset is appended to it.
        path: Directory prefix prepended to ``file_name`` when saving the font.
        file_name: File name used for the downloaded font.
        movies: Accumulated result rows, one dict per movie.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'Cookie': '__mta=121536567.1584593627741.1584593897024.1584777886424.8; uuid_n_v=v1; uuid=9B00E310699D11EA8A5063633091ACD9C38C5F45C9C9415380787B14353AC5AB; _csrf=f9260793472f1ebbc423f176336b484b9f605a7f1a2bb7d3436bafe00faf4957; _lxsdk_cuid=170f123e8a4b6-03647939c6392d-5a4c2571-1bcab9-170f123e8a5c8; _lxsdk=9B00E310699D11EA8A5063633091ACD9C38C5F45C9C9415380787B14353AC5AB; mojo-uuid=18e67de5f62ed4a876d4b99ab7123426; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1584593627,1584593644; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=121536567.1584593627741.1584593634819.1584593643928.3; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1584777886; mojo-session-id={"id":"5c14a6b56d13e6fcd9262030cdd640d4","time":1584781442462}; mojo-trace-id=3; _lxsdk_s=170fc575718-34-2dd-b31%7C%7C1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
        }
        self.base_url = 'https://maoyan.com/board/6?offset='
        self.path = ''
        self.file_name = 'fonts_maoyan.woff'
        self.movies = []

    def spider(self, num):
        """Fetch one board page, decode its counters and append rows to ``movies``.

        Args:
            num: Zero-based page index; the request offset is ``num * 10``.

        Raises:
            RuntimeError: If the page contains no ``.woff`` font URL.
        """
        print(f'正在爬取第{num+1}页....')
        resp_txt = requests.get(
            self.base_url + str(num * 10),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        ).text
        print('下载woff并写入中...')
        # Locate the page-specific font and store it locally.
        font_match = re.search(r"url\('(.*\.woff)'\) format", resp_txt)
        if font_match is None:
            raise RuntimeError('no .woff font URL found in the downloaded page')
        font_file = requests.get(
            'http:' + font_match.group(1),
            timeout=self.REQUEST_TIMEOUT,
        ).content
        # Read the font back from exactly the location it was written to.
        font_path = self.path + self.file_name
        with open(font_path, 'wb') as f:
            f.write(font_file)
        fonts_dict = self.getFonts(font_path)  # entity -> digit for this page
        print('匹配成功...')
        # Alternative approach: replace every '&#x....;' entity in the raw
        # HTML before parsing.  Here the parsed text is translated instead,
        # so the entities are converted to the characters the parser yields.
        digit_table = str.maketrans({
            chr(int(entity[3:-1], 16)): digit
            for entity, digit in fonts_dict.items()
        })
        page = etree.HTML(resp_txt)
        for dd in page.xpath('//dl[@class="board-wrapper"]/dd'):
            name = dd.xpath('.//div[@class="movie-item-info"]/p[@class="name"]/a/text()')[0]
            star = dd.xpath('.//div[@class="movie-item-info"]/p[@class="star"]/text()')
            # Some entries have no cast paragraph at all.
            star = star[0].replace('主演:', '') if star else '主演:暂无'
            releasetime = dd.xpath('.//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')[0].replace('上映时间:', '')
            new_see = self._wish_count(dd, 1, '本月新增想看:', digit_table)
            all_see = self._wish_count(dd, 2, '总想看:', digit_table)
            self.movies.append({
                '片名': name,
                '主演': star,
                '上映时间': releasetime,
                '本月新增想看': new_see,
                '总想看': all_see,
            })
            print(name, star, releasetime, new_see, all_see)

    @staticmethod
    def _wish_count(dd, position, label, digit_table):
        """Return the decoded counter text of the ``position``-th wish paragraph."""
        text = ''.join(dd.xpath(f'.//div[@class="movie-item-number wish"]/p[{position}]//text()'))
        return text.replace(label, '').replace('人', '').translate(digit_table)

    def getCoordinatesUnilist(self, fontsFile):
        """Read glyph names and their outline coordinates from a font file.

        Args:
            fontsFile: Path of the font file to open.

        Returns:
            A tuple ``(uni_list, gcs_objs)``: the glyph names and, in the same
            order, one list of coordinate points per glyph.
        """
        fonts = TTFont(f'{fontsFile}')
        try:
            # The first two glyph-order entries are skipped; presumably they
            # are non-digit placeholder glyphs -- TODO confirm with saveXML().
            uni_list = fonts.getGlyphOrder()[2:]
            gcs_objs = [list(fonts['glyf'][uni].coordinates) for uni in uni_list]
        finally:
            # Release the file handle so the font file can be overwritten.
            fonts.close()
        return uni_list, gcs_objs

    def getEuclideanDistance(self, axis1, axis2):
        """Return the Euclidean distance between two coordinate lists.

        The shorter list is treated as if padded with ``(0, 0)`` points up to
        the length of the longer one.  Neither argument is modified.

        Args:
            axis1: Sequence of ``(x, y)`` points.
            axis2: Sequence of ``(x, y)`` points.

        Returns:
            The square root of the summed squared coordinate differences.
        """
        size = max(len(axis1), len(axis2))
        padded1 = np.zeros((size, 2))
        padded2 = np.zeros((size, 2))
        # Guard the empty case: an empty list cannot be broadcast to (0, 2).
        if len(axis1):
            padded1[:len(axis1)] = axis1
        if len(axis2):
            padded2[:len(axis2)] = axis2
        return np.sqrt(np.sum(np.square(padded1 - padded2)))

    def getFonts(self, fonts):
        """Map each entity of the downloaded font to the digit it draws.

        Args:
            fonts: Path of the downloaded font file.

        Returns:
            A dict whose keys look like ``'&#xe0c6;'`` and whose values are
            single-digit strings.

        Raises:
            ValueError: If the reference font yields no glyphs to compare with.
        """
        # Built by hand by inspecting the glyphs of the reference font.
        base_fonts_dict = {'uniE0C6':'7', 'uniE3B7':'6', 'uniF7A5':'3', 'uniEF65':'5', 'uniF4C2':'0', 'uniEFE2':'1', 'uniE623':'2', 'uniEBEE':'4', 'uniF544':'8', 'uniE489':'9'}
        base_uni_list, base_gcs_objs = self.getCoordinatesUnilist('basefonts.woff')  # local reference font
        online_uni_list, online_gcs_objs = self.getCoordinatesUnilist(fonts)  # freshly downloaded font
        if not base_uni_list:
            raise ValueError('reference font basefonts.woff contains no glyphs to compare against')
        base_glyphs = list(zip(base_uni_list, base_gcs_objs))
        online_fonts_dict = {}
        for online_uni, online_gcs in zip(online_uni_list, online_gcs_objs):
            # The reference glyph with the smallest distance is taken as the
            # same digit; min() keeps the first one on ties.
            closest_uni, _ = min(
                base_glyphs,
                key=lambda glyph: self.getEuclideanDistance(online_gcs, glyph[1]),
            )
            online_fonts_dict['&#x' + online_uni[3:].lower() + ';'] = base_fonts_dict[closest_uni]
        return online_fonts_dict

    def run(self):
        """Scrape the first three board pages and print the collected rows."""
        for i in range(3):
            self.spider(i)
        print('爬取完成!!!')
        print(self.movies)
        # The downloaded font file is intentionally left on disk.
if __name__ == '__main__':
    # Script entry point: build the spider and crawl the board pages.
    maoyanspider().run()
# 以下是我调试时简写的程序
# html = """
#
# <!DOCTYPE html>
#
# <!--[if IE 8]><html class="ie8"><![endif]-->
# <!--[if IE 9]><html class="ie9"><![endif]-->
# <!--[if gt IE 9]><!--><html><!--<![endif]-->
# <head>
# <title>最受期待榜 - 猫眼电影 - 一网打尽好电影</title>
#
# <link rel="dns-prefetch" href="//p0.meituan.net" />
# <link rel="dns-prefetch" href="//p1.meituan.net" />
# <link rel="dns-prefetch" href="//ms0.meituan.net" />
# <link rel="dns-prefetch" href="//s0.meituan.net" />
# <link rel="dns-prefetch" href="//ms1.meituan.net" />
# <link rel="dns-prefetch" href="//analytics.meituan.com" />
# <link rel="dns-prefetch" href="//report.meituan.com" />
# <link rel="dns-prefetch" href="//frep.meituan.com" />
#
#
# <meta charset="utf-8">
# <meta name="keywords" content="猫眼电影,电影排行榜,热映口碑榜,最受期待榜,国内票房榜,北美票房榜,猫眼TOP100">
# <meta name="description" content="猫眼电影热门榜单,包括热映口碑榜,最受期待榜,国内票房榜,北美票房榜,猫眼TOP100,多维度为用户进行选片决策">
# <meta http-equiv="cleartype" content="yes" />
# <meta http-equiv="X-UA-Compatible" content="IE=edge" />
# <meta name="renderer" content="webkit" />
#
# <meta name="HandheldFriendly" content="true" />
# <meta name="format-detection" content="email=no" />
# <meta name="format-detection" content="telephone=no" />
# <meta name="viewport" content="width=device-width, initial-scale=1">
#
#
# <script>"use strict";!function(){var i=0<arguments.length&&void 0!==arguments[0]?arguments[0]:"_Owl_",n=window;n[i]||(n[i]={isRunning:!1,isReady:!1,preTasks:[],dataSet:[],use:function(i,t){this.isReady&&n.Owl&&n.Owl[i](t),this.preTasks.push({api:i,data:[t]})},add:function(i){this.dataSet.push(i)},run:function(){var t=this;if(!this.isRunning){this.isRunning=!0;var i=n.onerror;n.onerror=function(){this.isReady||this.add({type:"jsError",data:arguments}),i&&i.apply(n,arguments)}.bind(this),(n.addEventListener||n.attachEvent)("error",function(i){t.isReady||t.add({type:"resError",data:[i]})},!0)}}},n[i].run())}();</script>
# <script>
# cid = "c_wx6zb55";
# ci = 457;
# val = {"subnavId":6}; window.system = {};
#
# window.openPlatform = '';
# window.openPlatformSub = '';
# window.$mtsiFlag = '0';
#
# </script>
# <link rel="stylesheet" href="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/common.d1d257d3.css"/>
# <link rel="stylesheet" href="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/board-index.92a06072.css"/>
# <script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/stat.88d57c80.js"></script>
# <script>if(window.devicePixelRatio >= 2) { document.write('<link rel="stylesheet" href="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image-2x.8ba7074d.css"/>') }</script>
# <style>
# @font-face {
# font-family: stonefont;
# src: url('//vfile.meituan.net/colorstone/80f815d9d98bbfd977e414d58c4ff7873412.eot');
# src: url('//vfile.meituan.net/colorstone/80f815d9d98bbfd977e414d58c4ff7873412.eot?#iefix') format('embedded-opentype'),
# url('//vfile.meituan.net/colorstone/7f5b3cdbb5e113cd03f89656f664b8312276.woff') format('woff');
# }
#
# .stonefont {
# font-family: stonefont;
# }
# </style>
# <script>
# var _hmt = _hmt || [];
# (function() {
# var hm = document.createElement("script");
# hm.src = "https://hm.baidu.com/hm.js?703e94591e87be68cc8da0da7cbd0be2";
# var s = document.getElementsByTagName("script")[0];
# s.parentNode.insertBefore(hm, s);
# })();
# </script>
# </head>
# <body>
#
#
# <div class="header">
# <div class="header-inner">
# <a href="//maoyan.com" class="logo" data-act="icon-click"></a>
# <div class="city-container" data-val="{currentcityid:457 }">
# <div class="city-selected">
# <div class="city-name">
# 温岭
# <span class="caret"></span>
# </div>
# </div>
# <div class="city-list" data-val="{ localcityid: 457 }">
# <div class="city-list-header">定位城市:<a class="js-geo-city" data-ci="457">温岭</a></div>
#
# </div>
# </div>
#
#
# <div class="nav">
# <ul class="navbar">
# <li><a href="/" data-act="home-click" >首页</a></li>
# <li><a href="/films" data-act="movies-click" >电影</a></li>
# <li><a href="/cinemas" data-act="cinemas-click" >影院</a></li>
# <li><a href="http://www.gewara.com">演出</a></li>
#
# <li><a href="/board" data-act="board-click" class="active" >榜单</a></li>
# <li><a href="/news" data-act="hotNews-click" >热点</a></li>
# <li><a href="/edimall" >商城</a></li>
# </ul>
# </div>
#
# <div class="user-info">
# <div class="user-avatar J-login">
# <img src="https://p0.meituan.net/movie/7dd82a16316ab32c8359debdb04396ef2897.png">
# <span class="caret"></span>
# <ul class="user-menu no-login-menu">
# <li><a href="javascript:void 0">登录</a></li>
# </ul>
# </div>
# </div>
#
# <form action="/query" target="_blank" class="search-form" data-actform="search-click">
# <input name="kw" class="search" type="search" maxlength="32" placeholder="找影视剧、影人、影院" autocomplete="off">
# <input class="submit" type="submit" value="">
# </form>
#
# <div class="app-download">
# <a href="/app" target="_blank">
# <span class="iphone-icon"></span>
# <span class="apptext">APP下载</span>
# <span class="caret"></span>
# <div class="download-icon">
# <p class="down-title">扫码下载APP</p>
# <p class='down-content'>选座更优惠</p>
# </div>
# </a>
# </div>
#
# </div>
# </div>
# <div class="header-placeholder"></div>
#
# <div class="subnav">
# <ul class="navbar">
# <li>
# <a data-act="subnav-click" data-val="{subnavClick:7}"
# href="/board/7"
# >热映口碑榜</a>
# </li>
# <li>
# <a data-act="subnav-click" data-val="{subnavClick:6}"
# data-state-val="{subnavId:6}"
# class="active" href="javascript:void(0);"
# >最受期待榜</a>
# </li>
# <li>
# <a data-act="subnav-click" data-val="{subnavClick:1}"
# href="/board/1"
# >国内票房榜</a>
# </li>
# <li>
# <a data-act="subnav-click" data-val="{subnavClick:2}"
# href="/board/2"
# >北美票房榜</a>
# </li>
# <li>
# <a data-act="subnav-click" data-val="{subnavClick:4}"
# href="/board/4"
# >TOP100榜</a>
# </li>
# </ul>
# </div>
#
#
# <div class="container" id="app" class="page-board/index" >
#
# <div class="content">
# <div class="wrapper">
# <div class="main">
# <p class="update-time">2020-03-22<span class="has-fresh-text">已更新</span></p>
# <p class="board-content">榜单规则:将昨日国内待映影片,按照之前30天的想看数总量从高到低排列取前50名,每天上午10点更新。相关数据来源于“猫眼电影库”。</p>
# <dl class="board-wrapper">
# <dd>
# <i class="board-index board-index-1">1</i>
# <a href="/films/1217023" title="唐人街探案3" class="image-link" data-act="boarditem-click" data-val="{movieId:1217023}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/movie/b0e0d6ce9914f37e7f9f6ade13e096342721150.jpg@160w_220h_1e_1c" alt="唐人街探案3" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1217023" title="唐人街探案3" data-act="boarditem-click" data-val="{movieId:1217023}">唐人街探案3</a></p>
# <p class="star">主演:王宝强,刘昊然,妻夫木聪</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-2">2</i>
# <a href="/films/1211269" title="姜子牙" class="image-link" data-act="boarditem-click" data-val="{movieId:1211269}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/movie/3dc44919f0917b5823c867813f29fba42625689.jpg@160w_220h_1e_1c" alt="姜子牙" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1211269" title="姜子牙" data-act="boarditem-click" data-val="{movieId:1211269}">姜子牙</a></p>
# <p class="star">主演:郑希,杨凝,图特哈蒙</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-3">3</i>
# <a href="/films/1298859" title="木兰:横空出世" class="image-link" data-act="boarditem-click" data-val="{movieId:1298859}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p1.meituan.net/moviemachine/3b8ab40ddd5e499a6c2924f7c34b03702316978.jpg@160w_220h_1e_1c" alt="木兰:横空出世" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1298859" title="木兰:横空出世" data-act="boarditem-click" data-val="{movieId:1298859}">木兰:横空出世</a></p>
# <p class="releasetime">上映时间:2020-04</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-4">4</i>
# <a href="/films/248585" title="西游记真假美猴王" class="image-link" data-act="boarditem-click" data-val="{movieId:248585}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/movie/88dff6a7d97c6a91a125cece93ba72411017894.jpg@160w_220h_1e_1c" alt="西游记真假美猴王" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/248585" title="西游记真假美猴王" data-act="boarditem-click" data-val="{movieId:248585}">西游记真假美猴王</a></p>
# <p class="star">主演:六小龄童,马德华</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-5">5</i>
# <a href="/films/461076" title="紧急救援" class="image-link" data-act="boarditem-click" data-val="{movieId:461076}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p1.meituan.net/moviemachine/8c38e239e10bc0b3db738b563dbb64a91093960.png@160w_220h_1e_1c" alt="紧急救援" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/461076" title="紧急救援" data-act="boarditem-click" data-val="{movieId:461076}">紧急救援</a></p>
# <p class="star">主演:彭于晏,王彦霖,辛芷蕾</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-6">6</i>
# <a href="/films/1218142" title="拆弹专家2" class="image-link" data-act="boarditem-click" data-val="{movieId:1218142}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/movie/ccea56ae49249d482be5997aa98e94691344390.jpg@160w_220h_1e_1c" alt="拆弹专家2" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1218142" title="拆弹专家2" data-act="boarditem-click" data-val="{movieId:1218142}">拆弹专家2</a></p>
# <p class="star">主演:刘德华,刘青云,倪妮</p><p class="releasetime">上映时间:2020-07</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-7">7</i>
# <a href="/films/1217123" title="夺冠" class="image-link" data-act="boarditem-click" data-val="{movieId:1217123}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/moviemachine/6d175dd08812a4ff3f048f7b7e4ed449405105.jpg@160w_220h_1e_1c" alt="夺冠" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1217123" title="夺冠" data-act="boarditem-click" data-val="{movieId:1217123}">夺冠</a></p>
# <p class="star">主演:巩俐,黄渤,吴刚</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-8">8</i>
# <a href="/films/344450" title="爵迹2" class="image-link" data-act="boarditem-click" data-val="{movieId:344450}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p1.meituan.net/movie/eed97cfe2718d28d688615830856d07b226733.jpg@160w_220h_1e_1c" alt="爵迹2" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/344450" title="爵迹2" data-act="boarditem-click" data-val="{movieId:344450}">爵迹2</a></p>
# <p class="star">主演:范冰冰,吴亦凡,陈学冬</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-9">9</i>
# <a href="/films/1216053" title="急先锋" class="image-link" data-act="boarditem-click" data-val="{movieId:1216053}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/movie/2d4835513a81f6121189e1c3800eb1a3647939.jpg@160w_220h_1e_1c" alt="急先锋" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1216053" title="急先锋" data-act="boarditem-click" data-val="{movieId:1216053}">急先锋</a></p>
# <p class="star">主演:成龙,杨洋,艾伦</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# <dd>
# <i class="board-index board-index-10">10</i>
# <a href="/films/1230199" title="小妇人" class="image-link" data-act="boarditem-click" data-val="{movieId:1230199}">
# <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
# <img data-src="https://p0.meituan.net/movie/f09d25b5dc2d0080fa93db9ffaaf4c5a6389423.jpg@160w_220h_1e_1c" alt="小妇人" class="board-img" />
# </a>
# <div class="board-item-main">
# <div class="board-item-content">
# <div class="movie-item-info">
# <p class="name"><a href="/films/1230199" title="小妇人" data-act="boarditem-click" data-val="{movieId:1230199}">小妇人</a></p>
# <p class="star">主演:西尔莎·罗南,艾玛·沃特森,佛罗伦斯·珀</p><p class="releasetime">上映时间:2020</p> </div>
# <div class="movie-item-number wish">
# <p class="month-wish">本月新增想看:<span><span class="stonefont"></span></span>人</p>
# <p class="total-wish">总想看:<span><span class="stonefont"></span></span>人</p>
# </div>
#
# </div>
# </div>
#
# </dd>
# </dl>
#
# </div>
# <div class="pager-main">
#
#
# <ul class="list-pager">
#
#
#
# <li class="active">
# <a class="page_1"
# href="javascript:void(0);" style="cursor: default"
# >1</a>
#
# </li>
# <li >
# <a class="page_2"
# href="?offset=10"
# >2</a>
#
# </li>
# <li >
# <a class="page_3"
# href="?offset=20"
# >3</a>
#
# </li>
# <li >
# <a class="page_4"
# href="?offset=30"
# >4</a>
#
# </li>
# <li >
# <a class="page_5"
# href="?offset=40"
# >5</a>
#
# </li>
#
#
# <li> <a class="page_2"
# href="?offset=10"
# >下一页</a>
# </li>
# </ul>
#
#
# </div>
# </div>
# </div>
#
# </div>
#
# <div class="footer">
# <p class="friendly-links">
# 关于猫眼 :
# <a href="http://ir.maoyan.com/s/index.php#pageScroll0" target="_blank">关于我们</a>
# <span></span>
# <a href="http://ir.maoyan.com/s/index.php#pageScroll1" target="_blank">管理团队</a>
# <span></span>
# <a href="http://ir.maoyan.com/s/index.php#pageScroll2" target="_blank">投资者关系</a>
#
# 友情链接 :
# <a href="http://www.meituan.com" data-query="utm_source=wwwmaoyan" target="_blank">美团网</a>
# <span></span>
# <a href="http://www.gewara.com" data-query="utm_source=wwwmaoyan">格瓦拉</a>
# <span></span>
# <a href="http://i.meituan.com/client" data-query="utm_source=wwwmaoyan" target="_blank">美团下载</a>
# <span></span>
# <a href="https://www.huanxi.com" data-query="utm_source=maoyan_pc" target="_blank">欢喜首映</a>
# </p>
# <p class="friendly-links">
# 商务合作邮箱:v@maoyan.com
# 客服电话:10105335
# 违法和不良信息举报电话:4006018900
# </p>
# <p class="friendly-links">
# 用户投诉邮箱:tousujubao@meituan.com
# 舞弊线索举报邮箱:wubijubao@maoyan.com
# </p>
# <p class="friendly-links credentials">
# <a href="/about/licence/1" target="_blank">中华人民共和国增值电信业务经营许可证 京B2-20190350</a>
# <span></span>
# <a href="/about/licence/4" target="_blank">营业性演出许可证 京演(机构)(2019)4094号</a>
# </p>
# <p class="friendly-links credentials">
# <a href="/about/licence/3" target="_blank">广播电视节目制作经营许可证 (京)字第08478号</a>
# <span></span>
# <a href="/about/licence/2" target="_blank">网络文化经营许可证 京网文(2019)3837-369号 </a>
# </p>
# <p class="friendly-links credentials">
# <a href="/rules/agreement" target="_blank">猫眼用户服务协议 </a>
# <span></span>
# <a href="/rules/rule" target="_blank">猫眼平台交易规则总则 </a>
# <span></span>
# <a href="/rules/privacy" target="_blank">隐私政策 </a>
# </p>
# <p class="friendly-links credentials">
# <a href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11010102003232" target="_blank">京公网安备
# 11010102003232号</a>
# <span></span>
# <a href="http://www.beian.miit.gov.cn/" target="_blank">京ICP备16022489号</a>
# </p>
# <p>北京猫眼文化传媒有限公司</p>
# <p>
# ©<span class="my-footer-year">2016</span>
# 猫眼电影 maoyan.com</p>
# <div class="certificate">
# <a href="http://sq.ccm.gov.cn:80/ccnt/sczr/service/business/emark/toDetail/350CF8BCA8416C4FE0530140A8C0957E"
# target="_blank">
# <img src="http://p0.meituan.net/moviemachine/e54374ccf134d1f7b2c5b075a74fca525326.png" />
# </a>
# <a href="/about/licence/5" target="_blank">
# <img src="http://p1.meituan.net/moviemachine/805f605d5cf1b1a02a4e3a5e29df003b8376.png" />
# </a>
# </div>
# </div>
#
# <script crossorigin="anonymous" src="//www.dpfile.com/app/owl/static/owl_1.7.11.js"></script>
# <script>
# Owl.start({
# project: 'com.sankuai.movie.fe.mywww',
# pageUrl: location.href.split('?')[0].replace(/\/\d+/g, '/:id'),
# devMode: false
# })
# </script>
# <script src="//s0.meituan.net/bs/?f=myfe/canary:mojo-0.1.2.js"></script>
# <script>
# MAInit({
# appkey: 'com.sankuai.movie.fe.mywww',
# app_name: 'maoyan-pc-web',
# app_version: '1.0.0',
# })
# </script>
# <!--[if IE 8]><script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/es5-shim.bbad933f.js"></script><![endif]-->
# <!--[if IE 8]><script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/es5-sham.d6ea26f4.js"></script><![endif]-->
# <script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/common.3c7410fb.js"></script>
# <script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/board-index.e144d497.js"></script>
# </body>
# </html>
#
# """
# dict = {'': '9', '': '8', '': '1', '': '3', '': '2', '': '7',
# '': '0', '': '5', '': '4', '': '6'}
#
# utf_list = [eval(r"'\u" + x[3:7] + "'") for x in dict.keys()] # 转换格式
# utf_dict = {i: j for i, j in zip(utf_list, dict.values())}
#
# for i in re.findall(r'(&#x[\w]{4};)', html):
# html = html.replace(i, dict[i])
#
# aa = etree.HTML(html)
# dds = aa.xpath('//dl[@class="board-wrapper"]/dd')
# for dd in dds:
# name = dd.xpath('.//div[@class="movie-item-info"]/p[@class="name"]/a/text()')[0]
# star = dd.xpath('.//div[@class="movie-item-info"]/p[@class="star"]/text()')
# star = '暂无' if star == [] else star[0].replace('主演:', '')
# releasetime = dd.xpath('.//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')[0].replace('上映时间:', '')
# new_see = ''.join(dd.xpath('.//div[@class="movie-item-number wish"]/p[1]//text()')).replace('本月新增想看:', '').replace(
# '人', '')
# # for i in utf_dict.keys():
# # new_see = new_see.replace(i,utf_dict[i])
# all_see = ''.join(dd.xpath('.//div[@class="movie-item-number wish"]/p[2]//text()')).replace('总想看:', '').replace('人',
# '')
# # for i in utf_dict.keys():
# # all_see = all_see.replace(i,utf_dict[i])
# print(name, star, releasetime)
# print(new_see, all_see)