Main code and analysis:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 12 11:12:16 2018
利用正则表达式爬虫猫眼top100
@author: Administrator
"""
import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool
import hashlib
import os
# 1、获取一页信息
def get_one_page(url):
    """Fetch a single board page and return its HTML text, or None on failure.

    A browser-like User-Agent header is sent because the site rejects the
    default requests UA. Returns None for non-200 responses and for any
    requests-level error.
    """
    ua = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    try:
        response = requests.get(url, headers={'User-Agent': ua})
    except RequestException:
        return None
    return response.text if response.status_code == 200 else None
# 2、解析一页信息
def parse_one_page(html):
    """Parse one page of board HTML; yield one dict per movie entry.

    Each yielded dict has the keys: index, image, title, actors, time, score.
    Yields nothing when *html* is empty or None (e.g. when the fetch failed),
    instead of raising TypeError from re.findall.
    """
    if not html:
        return
    # One <dd> element per movie; re.S lets .*? span newlines.
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)"'
        '.*?name.*?a.*?>(.*?)</a>.*?star">(.*?)</p>'
        '.*?releasetime">(.*?)</p>.*?integer">(.*?)</i'
        '.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for item in re.findall(pattern, html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # [3:] drops the "主演:" (starring) prefix.
            'actors': item[3].strip()[3:],
            # [5:] drops the "上映时间:" (release time) prefix.
            'time': item[4].strip()[5:],
            # Bug fix: key was misspelled 'socre' in the original output.
            'score': item[5] + item[6],
        }
# 3、写入文件
def wirte_to_file(content):
    """Append one record to result.txt as a UTF-8 JSON line.

    NOTE: the function name keeps the original 'wirte' misspelling so
    existing callers are unaffected.

    json.dumps is required because file.write() needs a str, not a dict;
    ensure_ascii=False keeps Chinese characters human-readable in the file.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
    # The redundant f.close() was removed: `with` already closes the file.
def main(offset):
    """Scrape one page of the Top-100 board and persist each movie found.

    *offset* is the paging parameter of the board URL (0, 10, 20, ...).
    """
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        wirte_to_file(movie)
#==============================================================================
# r=requests.get('http://p3.pstatp.com/origin/pgc-image/15317459528784e9ce3883e')
# print(r.content)
#==============================================================================
if __name__ == '__main__':
    # Each page shows 10 movies; offsets 0, 10, ..., 90 cover the Top 100.
    # Bug fix: the original ran the same 10 offsets twice — once serially in
    # a for-loop and once via pool.map — so every record landed in
    # result.txt twice. Run them once, in parallel, and shut the pool down
    # cleanly afterwards.
    pool = Pool()
    pool.map(main, [offset * 10 for offset in range(10)])
    pool.close()
    pool.join()