猫眼电影TOP100爬取

本次练习使用到的知识点
  • Requests 库的基本使用
  • 正则表达式的使用
  • Python3写入CSV
1、项目流程分析
未命名文件.png

2、中心调度
# 主调度程序
def main():
    """Drive the crawl: fetch each board page, parse it, store rows, download covers."""
    # 起始URL
    start_url = 'http://maoyan.com/board/4'
    # The TOP100 board pages 10 movies at a time via the ?offset= query parameter.
    for i in range(0, 100, 10):
        # 获取响应文本内容
        html = get_one_page(url=start_url, offset=i)
        if html is None:
            # Fix: the original mixed '%s' placeholders with str.format(), so the
            # URL and offset were never substituted into the message.
            print('链接:{}?offset={}异常'.format(start_url, i))
            continue
        for item in parse_one_page(html=html):
            store_data(item)
            download_thumb(item['title'], item['thumb'])
3、页面内容获取
# 请求一个页面返回响应内容
def get_one_page(url, offset):
    """GET the board page at the given offset; return its HTML text, or None on failure."""
    try:
        resp = requests.get(url, params={'offset': offset})
    except RequestException:
        # Network-level failure (timeout, DNS, connection error).
        return None
    # Only an HTTP 200 response counts as success.
    return resp.text if resp.status_code == 200 else None
4、页面解析
# 解析一个页面
def parse_one_page(html):
    """Yield one dict per movie parsed from a board page's HTML.

    Keys: index, thumb, title, actors, release_time, area, score.
    """
    # Fix: raw strings so '\d' is not an invalid escape sequence.
    pattern = (r'<dd>.*?board-index.*?">(\d+)</i>.*?data-src="(.*?)".*?/>'
               r'.*?movie-item-info.*?title="(.*?)".*?star">'
               r'(.*?)</p>.*?releasetime">(.*?)</p>'
               r'.*?integer">(.*?)</i>.*?fraction">(\d+)</i>.*?</dd>')
    # re.S 匹配多行 — '.' also matches newlines; each <dd> spans several lines.
    regex = re.compile(pattern, re.S)
    for item in regex.findall(html):
        yield {
            'index': item[0],
            'thumb': get_large_thumb(item[1]),
            'title': item[2],
            # drop the leading '主演:' label (3 characters)
            'actors': item[3].strip()[3:],
            # drop the leading '上映时间:' label (5 characters)
            'release_time': get_release_time(item[4].strip()[5:]),
            'area': get_release_area(item[4].strip()[5:]),
            # integer part + fraction part, e.g. '9.' + '5' -> '9.5'
            'score': item[5] + item[6],
        }

5、数据处理函数
# 获取上映时间
def get_release_time(data):
    """Return the date portion of *data*, dropping any trailing '(area)' suffix."""
    # Fix: raw string for the regex; match everything before the first '(' or EOL.
    regex = re.compile(r'^(.*?)(\(|$)')
    w = regex.search(data)
    return w.group(1)


# 获取上映地区
def get_release_area(data):
    """Return the area inside the trailing '(...)' of *data*, or a default when absent."""
    # Fix: raw string for the regex escapes.
    regex = re.compile(r'.*\((.*)\)')
    w = regex.search(data)
    if w is None:
        # No parenthesised area present.
        return ' 未知 '
    return w.group(1)


# 获取封面大图
def get_large_thumb(url):
    """Strip the '@...' resize suffix from a thumbnail URL to get the full-size image URL.

    Fix: the original raised AttributeError on URLs without an '@' suffix;
    such URLs are already full-size, so return them unchanged.
    """
    regex = re.compile(r'(.*?)@.*?')
    w = regex.search(url)
    if w is None:
        return url
    return w.group(1)
6、数据存储

# 存储数据
def store_data(item):
    """Append one movie record as a CSV row to movie.csv (best effort: errors are printed)."""
    # newline='' prevents the csv module from emitting blank lines on Windows.
    with open('movie.csv', 'a', newline='', encoding='utf-8') as data_csv:
        try:
            writer = csv.writer(data_csv)
            writer.writerow([item['index'], item['thumb'], item['title'],
                             item['actors'], item['release_time'],
                             item['area'], item['score']])
        except Exception as e:
            # Best-effort logging: report the failure and the offending record.
            print(e)
            print(item)


# 下载封面图
def download_thumb(title, url):
    """Download the cover image at *url* into thumb/<title>.jpg (best effort)."""
    import os  # local import: 'os' is not among the file's top-level imports
    try:
        response = requests.get(url=url)
        # Fix: create the target directory if missing — the original raised
        # FileNotFoundError here, which 'except RequestException' did not catch.
        os.makedirs('thumb', exist_ok=True)
        # 获取二进制数据 — write the raw image bytes
        with open('thumb/' + title + '.jpg', 'wb') as f:
            f.write(response.content)
        # removed redundant f.close(): the with-statement already closes the file
    except RequestException as e:
        print(e)
7、完整运行代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re
import csv
from requests.exceptions import RequestException


# 请求一个页面返回响应内容
def get_one_page(url, offset):
    """GET the board page at the given offset; return its HTML text, or None on failure."""
    try:
        resp = requests.get(url, params={'offset': offset})
    except RequestException:
        # Network-level failure (timeout, DNS, connection error).
        return None
    # Only an HTTP 200 response counts as success.
    return resp.text if resp.status_code == 200 else None


# 解析一个页面
def parse_one_page(html):
    """Yield one dict per movie parsed from a board page's HTML.

    Keys: index, thumb, title, actors, release_time, area, score.
    """
    # Fix: raw strings so '\d' is not an invalid escape sequence.
    pattern = (r'<dd>.*?board-index.*?">(\d+)</i>.*?data-src="(.*?)".*?/>'
               r'.*?movie-item-info.*?title="(.*?)".*?star">'
               r'(.*?)</p>.*?releasetime">(.*?)</p>'
               r'.*?integer">(.*?)</i>.*?fraction">(\d+)</i>.*?</dd>')
    # re.S 匹配多行 — '.' also matches newlines; each <dd> spans several lines.
    regex = re.compile(pattern, re.S)
    for item in regex.findall(html):
        yield {
            'index': item[0],
            'thumb': get_large_thumb(item[1]),
            'title': item[2],
            # drop the leading '主演:' label (3 characters)
            'actors': item[3].strip()[3:],
            # drop the leading '上映时间:' label (5 characters)
            'release_time': get_release_time(item[4].strip()[5:]),
            'area': get_release_area(item[4].strip()[5:]),
            # integer part + fraction part, e.g. '9.' + '5' -> '9.5'
            'score': item[5] + item[6],
        }


# 获取上映时间
def get_release_time(data):
    """Return the date portion of *data*, dropping any trailing '(area)' suffix."""
    # Fix: raw string for the regex; match everything before the first '(' or EOL.
    regex = re.compile(r'^(.*?)(\(|$)')
    w = regex.search(data)
    return w.group(1)


# 获取上映地区
def get_release_area(data):
    """Return the area inside the trailing '(...)' of *data*, or a default when absent."""
    # Fix: raw string for the regex escapes.
    regex = re.compile(r'.*\((.*)\)')
    w = regex.search(data)
    if w is None:
        # No parenthesised area present.
        return ' 未知 '
    return w.group(1)


# 获取封面大图
def get_large_thumb(url):
    """Strip the '@...' resize suffix from a thumbnail URL to get the full-size image URL.

    Fix: the original raised AttributeError on URLs without an '@' suffix;
    such URLs are already full-size, so return them unchanged.
    """
    regex = re.compile(r'(.*?)@.*?')
    w = regex.search(url)
    if w is None:
        return url
    return w.group(1)


# 存储数据
def store_data(item):
    """Append one movie record as a CSV row to movie.csv (best effort: errors are printed)."""
    # newline='' prevents the csv module from emitting blank lines on Windows.
    with open('movie.csv', 'a', newline='', encoding='utf-8') as data_csv:
        try:
            writer = csv.writer(data_csv)
            writer.writerow([item['index'], item['thumb'], item['title'],
                             item['actors'], item['release_time'],
                             item['area'], item['score']])
        except Exception as e:
            # Best-effort logging: report the failure and the offending record.
            print(e)
            print(item)


# 下载封面图
def download_thumb(title, url):
    """Download the cover image at *url* into thumb/<title>.jpg (best effort)."""
    import os  # local import: 'os' is not among the file's top-level imports
    try:
        response = requests.get(url=url)
        # Fix: create the target directory if missing — the original raised
        # FileNotFoundError here, which 'except RequestException' did not catch.
        os.makedirs('thumb', exist_ok=True)
        # 获取二进制数据 — write the raw image bytes
        with open('thumb/' + title + '.jpg', 'wb') as f:
            f.write(response.content)
        # removed redundant f.close(): the with-statement already closes the file
    except RequestException as e:
        print(e)


# 主调度程序
def main():
    """Drive the crawl: fetch each board page, parse it, store rows, download covers."""
    # 起始 URL
    start_url = 'http://maoyan.com/board/4'
    # The TOP100 board pages 10 movies at a time via the ?offset= query parameter.
    for i in range(0, 100, 10):
        # 获取响应文本内容
        html = get_one_page(url=start_url, offset=i)
        if html is None:
            # Fix: the original mixed '%s' placeholders with str.format(), so the
            # URL and offset were never substituted into the message.
            print('链接:{}?offset={}异常'.format(start_url, i))
            continue
        for item in parse_one_page(html=html):
            store_data(item)
            download_thumb(item['title'], item['thumb'])


if __name__ == '__main__':
    main()
8、运行结果
Clipboard Image.png

Clipboard Image.png


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值