python3.6爬虫库_反爬虫1-python3.6抓取猫眼电影信息

最新推荐文章于 2024-04-25 17:46:12 发布

weixin_39598501

最新推荐文章于 2024-04-25 17:46:12 发布

阅读量338

点赞数

文章标签： python3.6爬虫库

本文链接：https://blog.csdn.net/weixin_39598501/article/details/111497907

版权

#完整代码

import requests

import re

from fontTools.ttLib import TTFont

from bs4 import BeautifulSoup as bs

from lxml import html

from fontTools.ttLib import TTFont

# 抓取maoyan票房

class MaoyanSpider:
    """Scrape show names, halls, start times and prices from a Maoyan cinema page.

    The page renders prices with a web font whose glyph names change between
    requests, so the raw text holds private-use characters instead of digits.
    Each glyph of the page font is compared with the glyphs of a reference
    ("base") font whose glyph-to-character table is known, and the resulting
    map is used to decode the price text.
    """

    # Cinema page that is scraped.
    CINEMA_URL = 'http://maoyan.com/cinema/24311?poi=164257570'

    # Reference font downloaded by hand from the page; its glyph names are
    # the ones listed in BASE_GLYPH_NAMES.
    DEFAULT_BASE_FONT = 'C:\\Users\\nanafighting\\Desktop\\baseprice.woff'

    # Where the font referenced by the current page is stored.
    PAGE_FONT_FILE = 'maoyanprice.woff'

    # BASE_GLYPH_NAMES[k] in the reference font draws the character BASE_CHARS[k].
    BASE_CHARS = ['.', '6', '4', '7', '5', '2', '8', '0', '1', '9', '3']
    BASE_GLYPH_NAMES = ['x', 'uniF76E', 'uniEACB', 'uniE8D1', 'uniE737',
                        'uniE9B7', 'uniF098', 'uniF4DC', 'uniF85E',
                        'uniE2F1', 'uniEE4E']

    # Matches the protocol-relative woff URL in the page's @font-face rule.
    FONT_URL_RE = re.compile(r",\nurl\('(//.*\.woff)'\) format\('woff'\)")

    # Seconds to wait for each HTTP request before giving up.
    TIMEOUT = 10

    def __init__(self, base_font_path=None):
        """Set up the request headers and the reference font location.

        Args:
            base_font_path: Path of the reference woff file. When omitted,
                DEFAULT_BASE_FONT is used.
        """
        self.base_font_path = base_font_path or self.DEFAULT_BASE_FONT
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
        }

    @classmethod
    def _find_font_url(cls, page_text):
        """Return the absolute URL of the page's woff font, or None if absent."""
        found = cls.FONT_URL_RE.search(page_text)
        if found is None:
            return None
        return "http:" + found.group(1)

    @staticmethod
    def _glyph_name_to_char(glyph_name):
        """Turn a glyph name into the character it stands for in the page text.

        'uniXXXX' names become the code point XXXX and a one-character name
        is returned unchanged. Any other name yields None.
        """
        if glyph_name.startswith('uni'):
            try:
                return chr(int(glyph_name[3:], 16))
            except ValueError:
                return None
        if len(glyph_name) == 1:
            return glyph_name
        return None

    @staticmethod
    def _decode(text, glyph_map):
        """Replace every character of `text` found in `glyph_map` by its value."""
        return text.translate(str.maketrans(glyph_map))

    def _download_font(self, font_url, target_path):
        """Stream the font at `font_url` into the file `target_path`."""
        font_response = requests.get(font_url, stream=True, timeout=self.TIMEOUT)
        with open(target_path, "wb") as font_file:
            for chunk in font_response.iter_content(chunk_size=1024):
                if chunk:
                    font_file.write(chunk)

    def _build_glyph_map(self, page_font, base_font):
        """Map each obfuscated page character to the character it draws.

        A page glyph is identified by finding the reference glyph it compares
        equal to. Glyphs with no equal reference glyph are left out of the
        map, so the affected characters stay undecoded instead of shifting
        every later entry by one position.
        """
        page_glyphs = page_font['glyf']
        base_glyphs = base_font['glyf']
        glyph_map = {}
        # The first entry of the glyph order is skipped, as in the original script.
        for glyph_name in page_font.getGlyphOrder()[1:]:
            page_char = self._glyph_name_to_char(glyph_name)
            if page_char is None:
                continue
            outline = page_glyphs[glyph_name]
            for base_name, real_char in zip(self.BASE_GLYPH_NAMES, self.BASE_CHARS):
                if outline == base_glyphs[base_name]:
                    glyph_map[page_char] = real_char
                    break
        return glyph_map

    def getNote(self):
        """Fetch the cinema page, decode the prices and print one row per show.

        Side effects: performs two HTTP requests, writes PAGE_FONT_FILE to the
        working directory and prints the rows to stdout. Returns None.

        Raises:
            IndexError: The page contains no woff font reference.
        """
        # 'Referer' is the actual HTTP header name; the previous key 'refer'
        # was sent as an unknown header.
        headers = {**self.headers,
                   'host': 'maoyan.com',
                   'Referer': 'http://maoyan.com/news'}
        page_text = requests.get(self.CINEMA_URL, headers=headers,
                                 timeout=self.TIMEOUT).text

        font_url = self._find_font_url(page_text)
        if font_url is None:
            # Kept as IndexError, the type the former rst[0] lookup raised.
            raise IndexError("no woff font URL found in the cinema page")
        self._download_font(font_url, self.PAGE_FONT_FILE)

        # Parse the font that was just downloaded: its glyph names belong to
        # this very response and differ from any earlier download.
        glyph_map = self._build_glyph_map(TTFont(self.PAGE_FONT_FILE),
                                          TTFont(self.base_font_path))

        soup = bs(page_text, "html.parser")
        print('---------------Prices-----------------')
        if not soup.find_all('div', {'class': 'show-list'}):
            return

        names = soup.find_all('h3', {'class': 'movie-name'})
        halls = soup.find_all('span', {'class': 'hall'})
        begin_times = soup.find_all('span', {'class': 'begin-time'})
        prices = soup.find_all('span', {'class': 'stonefont'})
        # NOTE(review): the four page-wide lists are paired by position, which
        # assumes they line up one-to-one - confirm against the page markup.
        for name, hall, begin_time, price in zip(names, halls, begin_times, prices):
            print(name.get_text(), hall.get_text(), begin_time.get_text(),
                  self._decode(price.get_text(), glyph_map))

# Run the scraper. getNote() prints its rows itself and has no return value,
# so it is not wrapped in print() (that only added a stray "None" line).
spider = MaoyanSpider()

spider.getNote()

运行结果：

思路分解：

1.页面信息

查看信息发现价格存在乱码现象：

刷新页面，找到乱码字体的 URL 并下载 woff 格式文件。方法：复制该 URL，右键单击“转到”，下载完成后得到的文件即为代码中的 baseprice.woff；再次刷新网页，用同样的方法下载新的字体 URL，作为待匹配的 woff 文件，即为代码中的 maoprice.woff。

用 FontEditor（fontstore.baidu.com）打开保存的 baseprice.woff 文件，如下图：

与代码行对应：

反爬虫字体解析原理：先在网页上下载乱码字体文件 baseprice.woff，可以将其转化为 xml，用 PyCharm 打开即可看到字形信息；刷新页面后再下载 maoprice.woff 文件，对比可以看到二者的字形存在对应关系，据此就可以编写解析代码。

二者的对应关系：

2.字体解析代码：