python语言中整型对应的英文名_python实现给定电影英文名，在猫眼上爬到中文名和票房...

最新推荐文章于 2021-01-29 05:45:20 发布

weixin_39977488

最新推荐文章于 2021-01-29 05:45:20 发布

阅读量665

点赞数

文章标签： python语言中整型对应的英文名

[root@xxn maoyan]# cat cat.py

#!/usr/bin/env python

#coding:utf-8

import requests

from bs4 import BeautifulSoup

def movieurl(url):
    """Return the detail-page URL of the first movie on a search-result page.

    Args:
        url: Search URL to fetch (the callers build
            ``http://maoyan.com/query?kw=<name>&type=0``).

    Returns:
        ``http://maoyan.com`` followed by the ``href`` of the first link found
        inside the first ``channel-detail movie-item-title`` div.

    Raises:
        IndexError: The page contains no such div, or the div has no link
            with an ``href`` (i.e. the search returned no movie).
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    # Only the first hit is used, so look it up directly and check each step
    # instead of letting a missing element surface as a TypeError/KeyError.
    title_box = soup.find('div', class_="channel-detail movie-item-title")
    link = title_box.find('a') if title_box is not None else None
    if link is None or not link.get('href'):
        raise IndexError("no movie found in search results: %s" % url)

    return "http://maoyan.com%s" % link['href']

def moveinfo(url):
    """Scrape a movie's Chinese title and box-office unit from its detail page.

    Args:
        url: Detail-page URL of the movie.

    Returns:
        A ``(Chinesename, boxofficeunit)`` tuple. ``boxofficeunit`` is ``0``
        when the page has no box-office unit element; the callers treat that
        as "box office not available yet".

    Raises:
        AttributeError: The page has no ``movie-brief-container`` div with an
            ``h3`` title (not caught here).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')

    Chinesename = soup.find('div', class_="movie-brief-container").h3.string

    try:
        box = soup.find_all('div', class_="movie-index-content box")[0]
        boxofficeunit = box.find('span', class_='unit').string
    except (IndexError, AttributeError):
        # IndexError: no box-office div at all; AttributeError: div present
        # but no <span class="unit">. Anything else is a real error and must
        # propagate instead of being silently reported as "no box office".
        boxofficeunit = 0

    return Chinesename, boxofficeunit

if __name__ == '__main__':
    # Python 2's input() evaluates the typed text as an expression, so a plain
    # movie title raises NameError/SyntaxError. Read the raw line instead.
    try:
        read_line = raw_input
    except NameError:
        # Python 3: input() already returns the raw string.
        read_line = input

    Moviename = read_line("请输入电影的英文名字：")
    # The search endpoint expects '+' in place of spaces in the keyword.
    Moviename = Moviename.replace(' ', '+')
    url = "http://maoyan.com/query?kw=%s&type=0" % Moviename

    Chinesename, boxofficeunit = moveinfo(movieurl(url))
    # A single pre-formatted argument prints "name unit" identically whether
    # print is a statement (Python 2) or a function (Python 3).
    print(u"%s %s" % (Chinesename, boxofficeunit))

[root@xxn maoyan]# cat maoyan.py

#!/usr/bin/env python

# coding=utf-8

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import random

from PIL import Image

import pytesseract

import os

import cat

def imagedownlod(url):
    """Render *url* in PhantomJS and save a screenshot to ``maoyan.png``.

    Image loading is switched off to speed up the page load; the screenshot
    is only used to read the box-office figure.

    Args:
        url: Address of the movie detail page to capture.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400'
    ]
    # Pick a browser User-Agent at random from the list to look like a real browser.
    dcap["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)
    # Skipping images makes the page load much faster.
    dcap["phantomjs.page.settings.loadImages"] = False

    # Configure every capability first and start exactly one browser; the
    # previous version started a second driver and leaked the first process.
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        driver.set_window_size(1366, 3245)
        driver.get(url)
        driver.save_screenshot("maoyan.png")
    finally:
        # Always shut the PhantomJS process down, even if the page load fails.
        driver.quit()

def crop_image(image_path, crop_path,
               box=(668, 388, 668 + 158, 388 + 54),
               size=(1366, 3245)):
    """Cut the box-office region out of a page screenshot.

    Screenshots come out in different sizes, so cropping by the element
    position reported by the webdriver was unreliable. Instead the screenshot
    is first resized to one fixed size, where the box-office figure sits at a
    fixed position, and then cropped with absolute coordinates.

    Args:
        image_path: Path of the screenshot to read.
        crop_path: Path the cropped region is written to.
        box: ``(left, top, right, bottom)`` crop rectangle, in pixels of the
            resized image. Defaults to the original hard-coded region.
        size: ``(width, height)`` the screenshot is resized to before
            cropping. Defaults to the original hard-coded size.
    """
    # Resize in memory rather than through a fixed-name temporary file, and
    # close the source file as soon as the resized copy exists.
    with Image.open(image_path) as screenshot:
        # LANCZOS is the same high-quality filter that the older
        # Image.ANTIALIAS name referred to.
        normalized = screenshot.resize(size, Image.LANCZOS)

    normalized.crop(box).save(crop_path)

def words(image):
    """Recognise the digits in an image file, delete the file, return the text.

    The resized screenshots are not always readable by pytesseract as-is, so
    the image is converted to greyscale first and tesseract is run with a
    digits-only whitelist.

    Args:
        image: Path of the image to read. The file is removed afterwards,
            whether or not recognition succeeds.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    try:
        # Convert in memory; writing the greyscale copy back to disk only to
        # reopen it was redundant, since the file is deleted below.
        with Image.open(image) as source:
            grey = source.convert('L')
        # NOTE(review): newer tesseract releases may expect "--psm" instead of
        # "-psm" -- confirm against the installed version before changing.
        number = pytesseract.image_to_string(
            grey, config="-psm 8 -c tessedit_char_whitelist=1234567890")
    finally:
        # The file handle is closed by now, so removal is safe; do it even on
        # failure so that a bad image does not leave the crop behind.
        if os.path.exists(image):
            os.remove(image)
    return number

if __name__ == '__main__':
    # Python 2's input() evaluates the typed text as an expression, so a plain
    # movie title raises NameError/SyntaxError. Read the raw line instead.
    try:
        read_line = raw_input
    except NameError:
        # Python 3: input() already returns the raw string.
        read_line = input

    Moviename = read_line("请输入电影的英文名字：")
    Moviename = Moviename.replace(' ', '+')
    url = "http://maoyan.com/query?kw=%s&type=0" % Moviename

    # Resolve the detail page once: every cat.movieurl() call is a network
    # request. The title/unit lookup was dropped because its result was
    # never used by this script.
    detail_url = cat.movieurl(url)
    try:
        imagedownlod(detail_url)
        crop_image('maoyan.png', 'piaofang.png')
        print(words('piaofang.png'))
    finally:
        # Remove the screenshot even when cropping or OCR fails.
        if os.path.exists('maoyan.png'):
            os.remove('maoyan.png')

[root@xxn maoyan]# cat catseye.py

#!/usr/bin/env python

# coding=utf-8

import cat

import maoyan

import sys

import os

# Python 2-only idiom: the module is reloaded first and only then is the
# process-wide default string encoding switched to UTF-8, so the order of the
# two statements matters.
# NOTE(review): presumably needed so the byte-string labels printed in main()
# can be concatenated with the unicode text returned by the scraper -- confirm.
reload(sys)

sys.setdefaultencoding('utf8')

def main():
    """Ask for an English movie title and print its Chinese title and box office.

    The title is looked up through ``cat.movieurl`` and ``cat.moveinfo``. When
    a box-office unit is present, the detail page is screenshotted, cropped
    and OCR'd via the ``maoyan`` helpers to obtain the figure; otherwise the
    box office is reported as not available.
    """
    # Python 2's input() evaluates the typed text as an expression, so a plain
    # movie title raises NameError/SyntaxError. Read the raw line instead.
    try:
        read_line = raw_input
    except NameError:
        # Python 3: input() already returns the raw string.
        read_line = input

    moviename = read_line("请输入电影的英文名字：")
    # Apply both escapes to the same string. Previously the second replace()
    # started again from the raw input and discarded the space escaping.
    Moviename = moviename.replace(' ', '+').replace(':', '%3A')
    url = "http://maoyan.com/query?kw=%s&type=0" % Moviename

    # Resolve the detail page once and reuse it; each lookup is a network request.
    detail_url = cat.movieurl(url)
    Chinesename, boxofficeunit = cat.moveinfo(detail_url)

    if boxofficeunit == 0:
        # A unit of 0 means the page shows no box office, so there is nothing
        # to crop or recognise.
        print("您搜索的电影英文名字:" + moviename)
        print("您搜索的电影中文名字:" + Chinesename)
        print("你搜索的电影票房:" + '暂无')
        return

    try:
        maoyan.imagedownlod(detail_url)
        maoyan.crop_image('maoyan.png', 'piaofang.png')
        number = maoyan.words('piaofang.png')
    finally:
        # The screenshot exists only on this path; remove it even on failure.
        if os.path.exists('maoyan.png'):
            os.remove('maoyan.png')

    print("您搜索的电影英文名字:" + moviename)
    print("您搜索的电影中文名字:" + Chinesename)
    # Use the OCR result; the previous code referenced an undefined `number2`.
    print("你搜索的电影票房:" + str(number) + str(boxofficeunit))

# Run the interactive lookup only when this file is executed as a script.
if __name__ == "__main__":
    main()

测试：

weixin_39977488

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python语言中整型对应的英文名_python实现给定电影英文名，在猫眼上爬到中文名和票房...

&[root@xxnmaoyan]#catcat.py#!/usr/bin/envpython#coding:utf-8importrequestsfrombs4importBeautifulSoupdefmovieurl(url):"""用来获取电影的单页url地址"""headers={"User-Agent":"Mozilla/5.0(WindowsNT6...
复制链接

扫一扫