Python猫眼电影-字体反爬最新版

从多个猫眼页面的源代码中获取 woff 文件,从上表的分析可以看出:同一个数字在不同字体文件中的编码不同,对应字形的坐标个数也不一样,所以网上以前的代码已经不能用了,一度很苦恼该怎么办。

从这篇文章(https://blog.csdn.net/qq_43153418/article/details/104043760)中学到了处理方法,在此向原作者表示感谢。

下面贴出我的代码,具体分析可以到上面的文章中看

import requests
from lxml import etree
from fontTools.ttLib import TTFont
import re
import numpy as np
import os


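# In the page source, digits (or Chinese characters) appear as entities such as
# &#xe7b5; instead of plain text -- this is the font-based anti-scraping measure.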

class maoyanspider:
    """Scrape a Maoyan board page and decode its font-obfuscated numbers.

    The "want to see" counters on the page are rendered with a web font whose
    code points change between downloads.  Each downloaded font is matched
    glyph-by-glyph against a local reference font (``BASE_FONT_FILE``) whose
    glyph -> digit mapping is known, and the resulting table is used to turn
    the obfuscated characters back into digits.

    Collected rows are appended to ``self.movies`` as dicts.
    """

    # Local reference font; the glyph -> digit table for it lives in getFonts.
    BASE_FONT_FILE = 'basefonts.woff'
    # Seconds to wait on each HTTP request; without it requests can block forever.
    REQUEST_TIMEOUT = 10
    # Non-greedy on quotes so that several url('...') entries sharing one line
    # cannot be swallowed into a single match.
    _FONT_URL_RE = re.compile(r"url\('([^']*\.woff)'\) format")

    def __init__(self):
        # NOTE(review): the cookie is a hard-coded browser session copied by the
        # author; presumably it has to be refreshed when it stops working.
        self.headers = {
            'Cookie': '__mta=121536567.1584593627741.1584593897024.1584777886424.8; uuid_n_v=v1; uuid=9B00E310699D11EA8A5063633091ACD9C38C5F45C9C9415380787B14353AC5AB; _csrf=f9260793472f1ebbc423f176336b484b9f605a7f1a2bb7d3436bafe00faf4957; _lxsdk_cuid=170f123e8a4b6-03647939c6392d-5a4c2571-1bcab9-170f123e8a5c8; _lxsdk=9B00E310699D11EA8A5063633091ACD9C38C5F45C9C9415380787B14353AC5AB; mojo-uuid=18e67de5f62ed4a876d4b99ab7123426; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1584593627,1584593644; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=121536567.1584593627741.1584593634819.1584593643928.3; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1584777886; mojo-session-id={"id":"5c14a6b56d13e6fcd9262030cdd640d4","time":1584781442462}; mojo-trace-id=3; _lxsdk_s=170fc575718-34-2dd-b31%7C%7C1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
        }
        self.base_url = 'https://maoyan.com/board/6?offset='
        self.path = ''                          # prefix prepended to file_name
        self.file_name = 'fonts_maoyan.woff'    # where the downloaded font is stored
        self.movies = []                        # accumulated result rows

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    @staticmethod
    def _wish_count(dd, index, label, digit_table):
        """Read the *index*-th counter paragraph of *dd* and decode its digits."""
        text = ''.join(dd.xpath(f'.//div[@class="movie-item-number wish"]/p[{index}]//text()'))
        return text.replace(label, '').replace('人', '').translate(digit_table)

    def spider(self, num):
        """Fetch board page *num* (0-based) and append its movies to ``self.movies``.

        Downloads the page, downloads the web font it references, builds the
        obfuscated-character -> digit table for that font and parses every
        ``<dd>`` entry of the board.

        Raises:
            ValueError: if the page does not reference a ``.woff`` font.
        """
        print(f'正在爬取第{num+1}页....')
        resp_txt = requests.get(
            self.base_url + str(num * 10),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        ).text

        print('下载woff并写入中...')
        font_match = self._FONT_URL_RE.search(resp_txt)
        if font_match is None:
            raise ValueError('no .woff font URL found in the page; '
                             'the response is probably not a board page')
        font_url = 'http:' + font_match.group(1)    # the page uses a scheme-relative URL
        font_file = requests.get(font_url, timeout=self.REQUEST_TIMEOUT).content
        # Read the font back from the very path it was written to.
        font_path = self.path + self.file_name
        with open(font_path, 'wb') as f:
            f.write(font_file)
        fonts_dict = self.getFonts(font_path)       # '&#xabcd;' -> digit
        print('匹配成功...')

        # lxml has already turned the '&#x....;' entities into real characters,
        # so translate characters rather than entity strings.  chr(int(hex, 16))
        # yields the same character as evaluating a '\uXXXX' literal, without
        # executing text that originates from a downloaded file.
        digit_table = str.maketrans(
            {chr(int(entity[3:7], 16)): digit for entity, digit in fonts_dict.items()}
        )

        page = etree.HTML(resp_txt)
        for dd in page.xpath('//dl[@class="board-wrapper"]/dd'):
            name = self._first(dd.xpath('.//div[@class="movie-item-info"]/p[@class="name"]/a/text()'))
            star = dd.xpath('.//div[@class="movie-item-info"]/p[@class="star"]/text()')
            # Some entries have no cast paragraph at all.
            star = '主演:暂无' if star == [] else star[0].replace('主演:', '')
            releasetime = self._first(
                dd.xpath('.//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')
            ).replace('上映时间:', '')
            new_see = self._wish_count(dd, 1, '本月新增想看:', digit_table)
            all_see = self._wish_count(dd, 2, '总想看:', digit_table)

            movie = {
                '片名': name,
                '主演': star,
                '上映时间': releasetime,
                '本月新增想看': new_see,
                '总想看': all_see,
            }
            self.movies.append(movie)
            print(name, star, releasetime, new_see, all_see)

    def getCoordinatesUnilist(self, fontsFile):
        """Return ``(glyph_names, coordinates)`` for the font at *fontsFile*.

        ``glyph_names`` is the font's glyph order without its first two
        entries; ``coordinates`` holds, for each of those glyphs, the list of
        its outline points as yielded by the ``glyf`` table.
        """
        fonts = TTFont(str(fontsFile))
        try:
            # fonts.saveXML(...) can be used to dump the font and inspect the mapping.
            # The first two glyph-order entries are skipped; presumably they are
            # placeholder glyphs rather than digits -- TODO confirm.
            uni_list = fonts.getGlyphOrder()[2:]
            glyf = fonts['glyf']
            # Copy the points out so nothing refers to the font after it is closed.
            gcs_objs = [list(glyf[uni].coordinates) for uni in uni_list]
        finally:
            fonts.close()
        return uni_list, gcs_objs

    def getEuclideanDistance(self, axis1, axis2):
        """Return the Euclidean distance between two glyph outlines.

        *axis1* and *axis2* are sequences of ``(x, y)`` points.  The shorter
        one is treated as if it were padded with ``(0, 0)`` points up to the
        length of the longer one.  Neither argument is modified.
        """
        size = max(len(axis1), len(axis2))
        # Pad into fresh arrays instead of extending the caller's lists: the
        # same outlines are compared many times and must stay intact.
        padded1 = np.zeros((size, 2), dtype=float)
        padded2 = np.zeros((size, 2), dtype=float)
        if len(axis1):
            padded1[:len(axis1)] = np.asarray(axis1, dtype=float)
        if len(axis2):
            padded2[:len(axis2)] = np.asarray(axis2, dtype=float)
        return np.sqrt(np.sum(np.square(padded1 - padded2)))

    def getFonts(self, fonts):
        """Build the ``'&#xabcd;' -> digit`` table for the font file *fonts*.

        Every glyph of the downloaded font is assigned the digit of the
        reference glyph with the smallest Euclidean distance to it.

        Raises:
            ValueError: if the reference font yields no glyphs to compare with.
            KeyError: if the best match is a reference glyph that has no entry
                in the hand-made table below.
        """
        # Worked out by hand for the glyphs of the local reference font.
        base_fonts_dict = {'uniE0C6':'7', 'uniE3B7':'6', 'uniF7A5':'3', 'uniEF65':'5', 'uniF4C2':'0', 'uniEFE2':'1', 'uniE623':'2', 'uniEBEE':'4', 'uniF544':'8', 'uniE489':'9'}
        base_uni_list, base_gcs_objs = self.getCoordinatesUnilist(self.BASE_FONT_FILE)
        online_uni_list, online_gcs_objs = self.getCoordinatesUnilist(fonts)

        online_fonts_dict = {}
        for online_uni, online_gcs in zip(online_uni_list, online_gcs_objs):
            # Start from infinity so that arbitrarily large distances still
            # produce a match; strict '<' keeps the first of equal candidates.
            best_distance, best_uni = float('inf'), None
            for base_uni, base_gcs in zip(base_uni_list, base_gcs_objs):
                distance = self.getEuclideanDistance(online_gcs, base_gcs)
                if distance < best_distance:
                    best_distance, best_uni = distance, base_uni
            if best_uni is None:
                raise ValueError(f'no reference glyph available to match {online_uni!r}')
            # 'uniE0C6' -> '&#xe0c6;', the form used in the page source.
            online_fonts_dict['&#x' + online_uni[3:].lower() + ';'] = base_fonts_dict[best_uni]
        return online_fonts_dict

    def run(self):
        """Scrape the first three board pages and print the collected rows."""
        for i in range(3):
            self.spider(i)
        print('爬取完成!!!')
        print(self.movies)
        # The downloaded font is left on disk; remove it here if that is unwanted:
        # os.remove(self.file_name)

# Script entry point: build the spider and scrape when run directly.
if __name__ == '__main__':
    maoyanspider().run()




# 以下是我调试时简写的程序
# html = """
#
# <!DOCTYPE html>
#
# <!--[if IE 8]><html class="ie8"><![endif]-->
# <!--[if IE 9]><html class="ie9"><![endif]-->
# <!--[if gt IE 9]><!--><html><!--<![endif]-->
# <head>
#   <title>最受期待榜 - 猫眼电影 - 一网打尽好电影</title>
#
#   <link rel="dns-prefetch" href="//p0.meituan.net"  />
#   <link rel="dns-prefetch" href="//p1.meituan.net"  />
#   <link rel="dns-prefetch" href="//ms0.meituan.net" />
#   <link rel="dns-prefetch" href="//s0.meituan.net" />
#   <link rel="dns-prefetch" href="//ms1.meituan.net" />
#   <link rel="dns-prefetch" href="//analytics.meituan.com" />
#   <link rel="dns-prefetch" href="//report.meituan.com" />
#   <link rel="dns-prefetch" href="//frep.meituan.com" />
#
#
#   <meta charset="utf-8">
#   <meta name="keywords" content="猫眼电影,电影排行榜,热映口碑榜,最受期待榜,国内票房榜,北美票房榜,猫眼TOP100">
#   <meta name="description" content="猫眼电影热门榜单,包括热映口碑榜,最受期待榜,国内票房榜,北美票房榜,猫眼TOP100,多维度为用户进行选片决策">
#   <meta http-equiv="cleartype" content="yes" />
#   <meta http-equiv="X-UA-Compatible" content="IE=edge" />
#   <meta name="renderer" content="webkit" />
#
#   <meta name="HandheldFriendly" content="true" />
#   <meta name="format-detection" content="email=no" />
#   <meta name="format-detection" content="telephone=no" />
#   <meta name="viewport" content="width=device-width, initial-scale=1">
#
#
#   <script>"use strict";!function(){var i=0<arguments.length&&void 0!==arguments[0]?arguments[0]:"_Owl_",n=window;n[i]||(n[i]={isRunning:!1,isReady:!1,preTasks:[],dataSet:[],use:function(i,t){this.isReady&&n.Owl&&n.Owl[i](t),this.preTasks.push({api:i,data:[t]})},add:function(i){this.dataSet.push(i)},run:function(){var t=this;if(!this.isRunning){this.isRunning=!0;var i=n.onerror;n.onerror=function(){this.isReady||this.add({type:"jsError",data:arguments}),i&&i.apply(n,arguments)}.bind(this),(n.addEventListener||n.attachEvent)("error",function(i){t.isReady||t.add({type:"resError",data:[i]})},!0)}}},n[i].run())}();</script>
#   <script>
#   cid = "c_wx6zb55";
#   ci = 457;
# val = {"subnavId":6};    window.system = {};
#
#   window.openPlatform = '';
#   window.openPlatformSub = '';
#   window.$mtsiFlag = '0';
#
#   </script>
#   <link rel="stylesheet" href="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/common.d1d257d3.css"/>
# <link rel="stylesheet" href="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/board-index.92a06072.css"/>
#   <script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/stat.88d57c80.js"></script>
#   <script>if(window.devicePixelRatio >= 2) { document.write('<link rel="stylesheet" href="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image-2x.8ba7074d.css"/>') }</script>
#   <style>
#     @font-face {
#       font-family: stonefont;
#       src: url('//vfile.meituan.net/colorstone/80f815d9d98bbfd977e414d58c4ff7873412.eot');
#       src: url('//vfile.meituan.net/colorstone/80f815d9d98bbfd977e414d58c4ff7873412.eot?#iefix') format('embedded-opentype'),
#            url('//vfile.meituan.net/colorstone/7f5b3cdbb5e113cd03f89656f664b8312276.woff') format('woff');
#     }
#
#     .stonefont {
#       font-family: stonefont;
#     }
#   </style>
#   <script>
#   var _hmt = _hmt || [];
#   (function() {
#   var hm = document.createElement("script");
#   hm.src = "https://hm.baidu.com/hm.js?703e94591e87be68cc8da0da7cbd0be2";
#   var s = document.getElementsByTagName("script")[0];
#   s.parentNode.insertBefore(hm, s);
#   })();
#   </script>
# </head>
# <body>
#
#
# <div class="header">
#   <div class="header-inner">
#           <a href="//maoyan.com" class="logo" data-act="icon-click"></a>
#         <div class="city-container" data-val="{currentcityid:457 }">
#             <div class="city-selected">
#                 <div class="city-name">
#                   温岭
#                   <span class="caret"></span>
#                 </div>
#             </div>
#             <div class="city-list" data-val="{ localcityid: 457 }">
#                 <div class="city-list-header">定位城市:<a class="js-geo-city" data-ci="457">温岭</a></div>
#
#             </div>
#         </div>
#
#
#         <div class="nav">
#             <ul class="navbar">
#                 <li><a href="/" data-act="home-click"  >首页</a></li>
#                 <li><a href="/films" data-act="movies-click" >电影</a></li>
#                 <li><a href="/cinemas" data-act="cinemas-click" >影院</a></li>
#                 <li><a href="http://www.gewara.com">演出</a></li>
#
#                 <li><a href="/board" data-act="board-click"  class="active" >榜单</a></li>
#                 <li><a href="/news" data-act="hotNews-click" >热点</a></li>
#                 <li><a href="/edimall"  >商城</a></li>
#             </ul>
#         </div>
#
#         <div class="user-info">
#             <div class="user-avatar J-login">
#               <img src="https://p0.meituan.net/movie/7dd82a16316ab32c8359debdb04396ef2897.png">
#               <span class="caret"></span>
#               <ul class="user-menu no-login-menu">
#                 <li><a href="javascript:void 0">登录</a></li>
#               </ul>
#             </div>
#         </div>
#
#         <form action="/query" target="_blank" class="search-form" data-actform="search-click">
#             <input name="kw" class="search" type="search" maxlength="32" placeholder="找影视剧、影人、影院" autocomplete="off">
#             <input class="submit" type="submit" value="">
#         </form>
#
#         <div class="app-download">
#           <a href="/app" target="_blank">
#             <span class="iphone-icon"></span>
#             <span class="apptext">APP下载</span>
#             <span class="caret"></span>
#             <div class="download-icon">
#                 <p class="down-title">扫码下载APP</p>
#                 <p class='down-content'>选座更优惠</p>
#             </div>
#           </a>
#         </div>
#
#   </div>
# </div>
# <div class="header-placeholder"></div>
#
# <div class="subnav">
#   <ul class="navbar">
#     <li>
#       <a data-act="subnav-click" data-val="{subnavClick:7}"
#           href="/board/7"
#       >热映口碑榜</a>
#     </li>
#     <li>
#       <a data-act="subnav-click" data-val="{subnavClick:6}"
#           data-state-val="{subnavId:6}"
#           class="active" href="javascript:void(0);"
#       >最受期待榜</a>
#     </li>
#     <li>
#       <a data-act="subnav-click" data-val="{subnavClick:1}"
#           href="/board/1"
#       >国内票房榜</a>
#     </li>
#     <li>
#       <a data-act="subnav-click" data-val="{subnavClick:2}"
#           href="/board/2"
#       >北美票房榜</a>
#     </li>
#     <li>
#       <a data-act="subnav-click" data-val="{subnavClick:4}"
#           href="/board/4"
#       >TOP100榜</a>
#     </li>
#   </ul>
# </div>
#
#
#     <div class="container" id="app" class="page-board/index" >
#
# <div class="content">
#     <div class="wrapper">
#         <div class="main">
#             <p class="update-time">2020-03-22<span class="has-fresh-text">已更新</span></p>
#             <p class="board-content">榜单规则:将昨日国内待映影片,按照之前30天的想看数总量从高到低排列取前50名,每天上午10点更新。相关数据来源于“猫眼电影库”。</p>
#             <dl class="board-wrapper">
#                 <dd>
#                         <i class="board-index board-index-1">1</i>
#     <a href="/films/1217023" title="唐人街探案3" class="image-link" data-act="boarditem-click" data-val="{movieId:1217023}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/movie/b0e0d6ce9914f37e7f9f6ade13e096342721150.jpg@160w_220h_1e_1c" alt="唐人街探案3" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1217023" title="唐人街探案3" data-act="boarditem-click" data-val="{movieId:1217023}">唐人街探案3</a></p>
# <p class="star">主演:王宝强,刘昊然,妻夫木聪</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xe7b5;&#xe388;&#xe388;&#xecfb;&#xe7b5;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xf737;&#xf2f8;&#xf0bd;&#xf0bd;&#xf0bd;&#xe388;&#xf5a5;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-2">2</i>
#     <a href="/films/1211269" title="姜子牙" class="image-link" data-act="boarditem-click" data-val="{movieId:1211269}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/movie/3dc44919f0917b5823c867813f29fba42625689.jpg@160w_220h_1e_1c" alt="姜子牙" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1211269" title="姜子牙" data-act="boarditem-click" data-val="{movieId:1211269}">姜子牙</a></p>
# <p class="star">主演:郑希,杨凝,图特哈蒙</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xf737;&#xf0bd;&#xf737;&#xecfb;&#xf0bd;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xe388;&#xf0bd;&#xf495;&#xf495;&#xe388;&#xf5a5;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-3">3</i>
#     <a href="/films/1298859" title="木兰:横空出世" class="image-link" data-act="boarditem-click" data-val="{movieId:1298859}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p1.meituan.net/moviemachine/3b8ab40ddd5e499a6c2924f7c34b03702316978.jpg@160w_220h_1e_1c" alt="木兰:横空出世" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1298859" title="木兰:横空出世" data-act="boarditem-click" data-val="{movieId:1298859}">木兰:横空出世</a></p>
# <p class="releasetime">上映时间:2020-04</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xf2f8;&#xecfb;&#xf5a5;&#xe7b5;&#xf2f8;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xf0bd;&#xf2f8;&#xf5a5;&#xf0bd;&#xf495;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-4">4</i>
#     <a href="/films/248585" title="西游记真假美猴王" class="image-link" data-act="boarditem-click" data-val="{movieId:248585}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/movie/88dff6a7d97c6a91a125cece93ba72411017894.jpg@160w_220h_1e_1c" alt="西游记真假美猴王" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/248585" title="西游记真假美猴王" data-act="boarditem-click" data-val="{movieId:248585}">西游记真假美猴王</a></p>
# <p class="star">主演:六小龄童,马德华</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xe388;&#xf737;&#xf495;&#xe7b5;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xe7b5;&#xf737;&#xe388;&#xf2f8;&#xf0bd;&#xf737;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-5">5</i>
#     <a href="/films/461076" title="紧急救援" class="image-link" data-act="boarditem-click" data-val="{movieId:461076}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p1.meituan.net/moviemachine/8c38e239e10bc0b3db738b563dbb64a91093960.png@160w_220h_1e_1c" alt="紧急救援" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/461076" title="紧急救援" data-act="boarditem-click" data-val="{movieId:461076}">紧急救援</a></p>
# <p class="star">主演:彭于晏,王彦霖,辛芷蕾</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xf495;&#xe541;&#xf495;&#xe388;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xe541;&#xf737;&#xf0bd;&#xf495;&#xf737;&#xec19;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-6">6</i>
#     <a href="/films/1218142" title="拆弹专家2" class="image-link" data-act="boarditem-click" data-val="{movieId:1218142}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/movie/ccea56ae49249d482be5997aa98e94691344390.jpg@160w_220h_1e_1c" alt="拆弹专家2" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1218142" title="拆弹专家2" data-act="boarditem-click" data-val="{movieId:1218142}">拆弹专家2</a></p>
# <p class="star">主演:刘德华,刘青云,倪妮</p><p class="releasetime">上映时间:2020-07</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xec19;&#xf5a5;&#xf737;&#xf0bd;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xe7b5;&#xf737;&#xf737;&#xe388;&#xf737;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-7">7</i>
#     <a href="/films/1217123" title="夺冠" class="image-link" data-act="boarditem-click" data-val="{movieId:1217123}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/moviemachine/6d175dd08812a4ff3f048f7b7e4ed449405105.jpg@160w_220h_1e_1c" alt="夺冠" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1217123" title="夺冠" data-act="boarditem-click" data-val="{movieId:1217123}">夺冠</a></p>
# <p class="star">主演:巩俐,黄渤,吴刚</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xf0bd;&#xe541;&#xf495;&#xf495;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xe7b5;&#xec19;&#xecfb;&#xecfb;&#xe541;&#xe388;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-8">8</i>
#     <a href="/films/344450" title="爵迹2" class="image-link" data-act="boarditem-click" data-val="{movieId:344450}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p1.meituan.net/movie/eed97cfe2718d28d688615830856d07b226733.jpg@160w_220h_1e_1c" alt="爵迹2" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/344450" title="爵迹2" data-act="boarditem-click" data-val="{movieId:344450}">爵迹2</a></p>
# <p class="star">主演:范冰冰,吴亦凡,陈学冬</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xe541;&#xf5a5;&#xf737;&#xe541;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xe7b5;&#xecfb;&#xf5a5;&#xe541;&#xf2f8;&#xe7b5;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-9">9</i>
#     <a href="/films/1216053" title="急先锋" class="image-link" data-act="boarditem-click" data-val="{movieId:1216053}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/movie/2d4835513a81f6121189e1c3800eb1a3647939.jpg@160w_220h_1e_1c" alt="急先锋" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1216053" title="急先锋" data-act="boarditem-click" data-val="{movieId:1216053}">急先锋</a></p>
# <p class="star">主演:成龙,杨洋,艾伦</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xe541;&#xec19;&#xf0bd;&#xe7b5;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xf2f8;&#xf495;&#xf5a5;&#xf2f8;&#xf5a5;&#xec19;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#                 <dd>
#                         <i class="board-index board-index-10">10</i>
#     <a href="/films/1230199" title="小妇人" class="image-link" data-act="boarditem-click" data-val="{movieId:1230199}">
#       <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#       <img data-src="https://p0.meituan.net/movie/f09d25b5dc2d0080fa93db9ffaaf4c5a6389423.jpg@160w_220h_1e_1c" alt="小妇人" class="board-img" />
#     </a>
#     <div class="board-item-main">
#       <div class="board-item-content">
#               <div class="movie-item-info">
#         <p class="name"><a href="/films/1230199" title="小妇人" data-act="boarditem-click" data-val="{movieId:1230199}">小妇人</a></p>
# <p class="star">主演:西尔莎·罗南,艾玛·沃特森,佛罗伦斯·珀</p><p class="releasetime">上映时间:2020</p>    </div>
#     <div class="movie-item-number wish">
#         <p class="month-wish">本月新增想看:<span><span class="stonefont">&#xe541;&#xec19;&#xecfb;&#xf5a5;</span></span>人</p>
#         <p class="total-wish">总想看:<span><span class="stonefont">&#xf0bd;&#xf495;&#xec19;&#xec19;&#xe541;</span></span>人</p>
#     </div>
#
#       </div>
#     </div>
#
#                 </dd>
#             </dl>
#
#         </div>
#             <div class="pager-main">
#
#
#   <ul class="list-pager">
#
#
#
#     <li class="active">
#     <a class="page_1"
#       href="javascript:void(0);" style="cursor: default"
#   >1</a>
#
# </li>
#   <li >
#     <a class="page_2"
#       href="?offset=10"
#   >2</a>
#
# </li>
#   <li >
#     <a class="page_3"
#       href="?offset=20"
#   >3</a>
#
# </li>
#   <li >
#     <a class="page_4"
#       href="?offset=30"
#   >4</a>
#
# </li>
#   <li >
#     <a class="page_5"
#       href="?offset=40"
#   >5</a>
#
# </li>
#
#
# <li>  <a class="page_2"
#       href="?offset=10"
#   >下一页</a>
# </li>
# </ul>
#
#
#             </div>
#     </div>
# </div>
#
#     </div>
#
# <div class="footer">
#   <p class="friendly-links">
#     关于猫眼 :
#     <a href="http://ir.maoyan.com/s/index.php#pageScroll0" target="_blank">关于我们</a>
#     <span></span>
#     <a href="http://ir.maoyan.com/s/index.php#pageScroll1" target="_blank">管理团队</a>
#     <span></span>
#     <a href="http://ir.maoyan.com/s/index.php#pageScroll2" target="_blank">投资者关系</a>
#     &nbsp;&nbsp;&nbsp;&nbsp;
#     友情链接 :
#     <a href="http://www.meituan.com" data-query="utm_source=wwwmaoyan" target="_blank">美团网</a>
#     <span></span>
#     <a href="http://www.gewara.com" data-query="utm_source=wwwmaoyan">格瓦拉</a>
#     <span></span>
#     <a href="http://i.meituan.com/client" data-query="utm_source=wwwmaoyan" target="_blank">美团下载</a>
#     <span></span>
#     <a href="https://www.huanxi.com" data-query="utm_source=maoyan_pc" target="_blank">欢喜首映</a>
#   </p>
#   <p class="friendly-links">
#     商务合作邮箱:v@maoyan.com
#     客服电话:10105335
#     违法和不良信息举报电话:4006018900
#   </p>
#   <p class="friendly-links">
#     用户投诉邮箱:tousujubao@meituan.com
#     舞弊线索举报邮箱:wubijubao@maoyan.com
#   </p>
#   <p class="friendly-links  credentials">
#     <a href="/about/licence/1" target="_blank">中华人民共和国增值电信业务经营许可证 京B2-20190350</a>
#     <span></span>
#     <a href="/about/licence/4" target="_blank">营业性演出许可证 京演(机构)(2019)4094号</a>
#   </p>
#   <p class="friendly-links  credentials">
#     <a href="/about/licence/3" target="_blank">广播电视节目制作经营许可证 (京)字第08478号</a>
#     <span></span>
#     <a href="/about/licence/2" target="_blank">网络文化经营许可证 京网文(2019)3837-369号 </a>
#   </p>
#   <p class="friendly-links  credentials">
#     <a href="/rules/agreement" target="_blank">猫眼用户服务协议 </a>
#     <span></span>
#     <a href="/rules/rule" target="_blank">猫眼平台交易规则总则 </a>
#     <span></span>
#     <a href="/rules/privacy" target="_blank">隐私政策 </a>
#   </p>
#   <p class="friendly-links  credentials">
#     <a href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11010102003232" target="_blank">京公网安备
#       11010102003232号</a>
#     <span></span>
#     <a href="http://www.beian.miit.gov.cn/" target="_blank">京ICP备16022489号</a>
#   </p>
#   <p>北京猫眼文化传媒有限公司</p>
#   <p>
#     &copy;<span class="my-footer-year">2016</span>
#     猫眼电影 maoyan.com</p>
#   <div class="certificate">
#     <a href="http://sq.ccm.gov.cn:80/ccnt/sczr/service/business/emark/toDetail/350CF8BCA8416C4FE0530140A8C0957E"
#       target="_blank">
#       <img src="http://p0.meituan.net/moviemachine/e54374ccf134d1f7b2c5b075a74fca525326.png" />
#     </a>
#     <a href="/about/licence/5" target="_blank">
#       <img src="http://p1.meituan.net/moviemachine/805f605d5cf1b1a02a4e3a5e29df003b8376.png" />
#     </a>
#   </div>
# </div>
#
#     <script crossorigin="anonymous" src="//www.dpfile.com/app/owl/static/owl_1.7.11.js"></script>
#     <script>
#       Owl.start({
#         project: 'com.sankuai.movie.fe.mywww',
#         pageUrl: location.href.split('?')[0].replace(/\/\d+/g, '/:id'),
#         devMode: false
#       })
#     </script>
#     <script src="//s0.meituan.net/bs/?f=myfe/canary:mojo-0.1.2.js"></script>
#     <script>
#       MAInit({
#         appkey: 'com.sankuai.movie.fe.mywww',
#         app_name: 'maoyan-pc-web',
#         app_version: '1.0.0',
#       })
#     </script>
#     <!--[if IE 8]><script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/es5-shim.bbad933f.js"></script><![endif]-->
#     <!--[if IE 8]><script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/es5-sham.d6ea26f4.js"></script><![endif]-->
#     <script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/common.3c7410fb.js"></script>
# <script crossorigin="anonymous" src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/board-index.e144d497.js"></script>
# </body>
# </html>
#
# """
# dict = {'&#xf5a5;': '9', '&#xe388;': '8', '&#xf2f8;': '1', '&#xe7b5;': '3', '&#xf737;': '2', '&#xf495;': '7',
#         '&#xecfb;': '0', '&#xf0bd;': '5', '&#xe541;': '4', '&#xec19;': '6'}
#
# utf_list = [eval(r"'\u" + x[3:7] + "'") for x in dict.keys()]  # 转换格式
# utf_dict = {i: j for i, j in zip(utf_list, dict.values())}
#
# for i in re.findall(r'(&#x[\w]{4};)', html):
#     html = html.replace(i, dict[i])
#
# aa = etree.HTML(html)
# dds = aa.xpath('//dl[@class="board-wrapper"]/dd')
# for dd in dds:
#     name = dd.xpath('.//div[@class="movie-item-info"]/p[@class="name"]/a/text()')[0]
#     star = dd.xpath('.//div[@class="movie-item-info"]/p[@class="star"]/text()')
#     star = '暂无' if star == [] else star[0].replace('主演:', '')
#     releasetime = dd.xpath('.//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')[0].replace('上映时间:', '')
#     new_see = ''.join(dd.xpath('.//div[@class="movie-item-number wish"]/p[1]//text()')).replace('本月新增想看:', '').replace(
#         '人', '')
#     # for i in utf_dict.keys():
#     #     new_see = new_see.replace(i,utf_dict[i])
#     all_see = ''.join(dd.xpath('.//div[@class="movie-item-number wish"]/p[2]//text()')).replace('总想看:', '').replace('人',
#                                                                                                                     '')
#     # for i in utf_dict.keys():
#     #     all_see = all_see.replace(i,utf_dict[i])
#     print(name, star, releasetime)
#     print(new_see, all_see)


  • 3
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值