抓取的原理比较简单，这里不做过多解释，代码中的注释也写得比较清楚。
参考：《Python网络爬虫实战（第二版）》
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 16 14:20:20 2020
@author: hja
"""
from bs4 import BeautifulSoup
import urllib.request
import codecs
from mylog import MyLog as mylog
import sys
import re
class MovieItem(object):
    """Plain record holding the fields scraped for one movie.

    All three fields default to ``None``; they can be passed to the
    constructor or assigned after construction.
    """

    # Class-level defaults, kept so ``MovieItem.movieName`` etc. stay readable
    # on the class itself.
    movieName = None      # movie title
    movieScore = None     # rating text
    movieStarring = None  # starring/cast text

    def __init__(self, movieName=None, movieScore=None, movieStarring=None):
        """Create an item, optionally pre-filled with the given field values."""
        self.movieName = movieName
        self.movieScore = movieScore
        self.movieStarring = movieStarring

    def __repr__(self):
        return '%s(movieName=%r, movieScore=%r, movieStarring=%r)' % (
            type(self).__name__,
            self.movieName,
            self.movieScore,
            self.movieStarring,
        )
class GetMovie(object):
    """Scrape the 2019 movie listing on dianying.2345.com into a text file.

    Instantiating the class runs the whole job: it reads the page count from
    the first listing page, builds the URL of every listing page, parses the
    movies on each page into ``MovieItem`` objects and finally writes them to
    ``2019热门电影.txt`` in the current directory.
    """

    # Pattern a movie <li>'s ``media`` attribute must contain (e.g. media="204546").
    # Compiled once here instead of once per page; raw string avoids the
    # invalid-escape warning that '\d' triggers in a normal string literal.
    _MEDIA_ID = re.compile(r'\d{5}')

    def __init__(self):
        self.urlBase = r'http://dianying.2345.com/list/----2019---1.html'
        self.log = mylog()
        self.pages = self.getPages()
        self.urls = []   # pool of listing-page URLs
        self.items = []  # parsed MovieItem objects
        self.getUrls(self.pages)  # fill self.urls
        self.spider(self.urls)
        self.pipelines(self.items)

    def getPages(self):
        """Return the total number of listing pages.

        The count is parsed from the text of the second-to-last
        ``<a target="_self">`` inside ``<div class="v_page">`` on the first
        listing page.

        Returns:
            The page count; ``0`` when the first page cannot be downloaded,
            ``1`` when no usable pagination block is found (only the first
            page is then scraped).
        """
        self.log.info('开始获取页面')
        htmlContent = self.getResponseContent(self.urlBase)
        if htmlContent is None:
            # The download failure has already been logged.
            return 0
        soup = BeautifulSoup(htmlContent, 'lxml')
        tag = soup.find('div', attrs={'class': 'v_page'})
        subTags = [] if tag is None else tag.find_all('a', attrs={'target': '_self'})
        try:
            pages = int(subTags[-2].get_text())
        except (IndexError, ValueError):
            self.log.error('未能解析总页数, 仅抓取第1页')
            return 1
        self.log.info('获取网页成功')
        return pages

    def getResponseContent(self, url):
        """Download ``url`` and return its body decoded as GBK.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded page text, or ``None`` when the request fails
            (the failure is logged).
        """
        fakeHeaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
        request = urllib.request.Request(url, headers=fakeHeaders)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request) as response:
                rawData = response.read()
        except OSError:
            # urllib.error.URLError / HTTPError and socket errors are all OSError.
            self.log.error('Python返回url : %s 数据失败' % url)
            return None
        self.log.info('Python 返回URL :%s 数据成功' % url)
        # A stray undecodable byte should not abort the whole crawl.
        return rawData.decode('GBK', errors='replace')

    def getUrls(self, pages):
        """Append the URLs of listing pages ``1..pages`` to ``self.urls``.

        Args:
            pages: Number of listing pages; ``0`` adds nothing.
        """
        urlHead = 'http://dianying.2345.com/list/----2019---'
        urlEnd = '.html'
        for i in range(1, pages + 1):
            url = urlHead + str(i) + urlEnd
            self.urls.append(url)
            self.log.info('添加URL:%s 到 URLS列表' % url)

    def spider(self, urls):
        """Parse every listing page in ``urls`` and collect its movies.

        Pages that cannot be downloaded or that lack the movie list are
        skipped (and logged) instead of aborting the run. Parsed movies are
        appended to ``self.items``.

        Args:
            urls: Iterable of listing-page URLs.
        """
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                continue  # failure already logged by getResponseContent
            soup = BeautifulSoup(htmlContent, 'lxml')
            # Name, score and starring all live under
            # <ul class="v_picTxt pic180_240 clearfix">.
            anchorTag = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
            if anchorTag is None:
                self.log.error('页面 %s 中未找到电影列表' % url)
                continue
            # One <li media="204546"> per movie.
            for tag in anchorTag.find_all('li', attrs={'media': self._MEDIA_ID}):
                item = self._parseItem(tag)
                if item is None:
                    self.log.error('页面 %s 中有条目缺少电影名字, 已跳过' % url)
                    continue
                self.items.append(item)
                self.log.info('获取电影名字为: <<%s>> 成功' % (item.movieName))

    @staticmethod
    def _parseItem(tag):
        """Build a MovieItem from one movie ``<li>``; ``None`` if it has no title."""
        nameTag = tag.find('span', attrs={'class': 'sTit'})
        if nameTag is None:
            return None
        scoreTag = tag.find('span', attrs={'class': 'pRightBottom'})
        scoreEm = None if scoreTag is None else scoreTag.em
        starTag = tag.find('span', attrs={'class': 'sDes'})

        item = MovieItem()
        # <span class="sTit">title</span>
        item.movieName = nameTag.get_text()
        # <span class="pRightBottom"><em>9.3分</em></span>; empty when absent
        item.movieScore = '' if scoreEm is None else scoreEm.get_text().replace('分', '')
        # <span class="sDes">主演...</span>; empty when absent
        item.movieStarring = '' if starTag is None else starTag.get_text().replace('主演', '')
        return item

    @staticmethod
    def _formatItem(item):
        """Return name + starring + score with every space and newline removed."""
        parts = (item.movieName, item.movieStarring, item.movieScore)
        mess = ''.join(part or '' for part in parts)  # tolerate unset (None) fields
        return mess.replace('\n', '').replace(' ', '')

    def pipelines(self, items):
        """Write ``items`` to ``2019热门电影.txt`` (UTF-8), one movie per entry.

        Each entry is the compacted movie text followed by ``'\\n \\n'``.

        Args:
            items: Iterable of objects with ``movieName``, ``movieStarring``
                and ``movieScore`` attributes.
        """
        fileName = '2019热门电影.txt'
        # newline='\n' disables newline translation, so the bytes written are
        # the same on every platform.
        with open(fileName, 'w', encoding='utf-8', newline='\n') as fp:
            for item in items:
                fp.write('%s\n \n' % (self._formatItem(item)))
                self.log.info('电影名字为 :<<%s>>已成功存入文件"%s"...' % (item.movieName, fileName))
if __name__ == '__main__':
    # Instantiating GetMovie runs the whole job (see GetMovie.__init__).
    GM = GetMovie()
我稍微修改了一下写入 TXT 文件的排版，效果如下：
PS（2021年5月28日 18:00:27 补充）：
mylog.py 的代码如下：
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 16 14:03:03 2020
@author: asus
"""
import logging
import getpass
import sys
### Definition of the MyLog class
class MyLog(object):
    """Small logging wrapper that logs to both a file and the console.

    The underlying logger is named after the current user
    (``getpass.getuser()``). The log file sits next to the running script:
    the path in ``sys.argv[0]`` with its extension replaced by ``.log``.
    """

    # logging.getLogger(name) returns the same logger object for the same name,
    # so handlers must be attached only once per name; otherwise every extra
    # MyLog() instance would duplicate each log line. Maps logger name to
    # (logFile, formatter, logHand, logHandSt).
    _configured = {}

    ### Constructor of MyLog
    def __init__(self):
        import os  # local import: only needed to derive the log file name

        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        shared = MyLog._configured.get(self.user)
        if shared is None:
            # Log file name: script path with its extension swapped for
            # '.log'. splitext handles any extension (or none), unlike
            # chopping a fixed three characters.
            scriptBase = os.path.splitext(sys.argv[0])[0] or 'mylog'
            logFile = scriptBase + '.log'
            formatter = logging.Formatter('%(asctime) - 12s %(levelname) - 8s %(name) -10s %(message) -12s\r\n')
            # Send records both to the log file and to the screen.
            logHand = logging.FileHandler(logFile, encoding='utf-8')
            logHand.setFormatter(formatter)
            logHand.setLevel(logging.DEBUG)
            logHandSt = logging.StreamHandler()
            logHandSt.setFormatter(formatter)
            logHandSt.setLevel(logging.DEBUG)
            self.logger.addHandler(logHand)
            self.logger.addHandler(logHandSt)
            shared = (logFile, formatter, logHand, logHandSt)
            MyLog._configured[self.user] = shared

        # Expose the same attributes whether the handlers were just created
        # or are being reused.
        self.logFile, self.formatter, self.logHand, self.logHandSt = shared

    ### The five log levels map to the five methods below
    def debug(self, msg):
        """Log ``msg`` at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log ``msg`` at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log ``msg`` at WARNING level."""
        # Logger.warn is a deprecated alias; Logger.warning is the supported name.
        self.logger.warning(msg)

    # Conventional spelling, offered alongside the original ``warn``.
    warning = warn

    def error(self, msg):
        """Log ``msg`` at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log ``msg`` at CRITICAL level."""
        self.logger.critical(msg)
if __name__ == '__main__':
    # Smoke test: emit one record at each of the five levels.
    mylog = MyLog()
    mylog.debug(u'I am debug 测试中文')
    mylog.info(u'I am info ')
    mylog.warn(u'I am warn')
    mylog.error(u'I am error 测试中文')
    mylog.critical(u'I am critical')