A Simple Web Crawler
1. Goal: scrape the latest 2017 movies from the 2345 movie site (dianying.2345.com).
2. Libraries used:
from bs4 import BeautifulSoup
import requests
import codecs
Test environment: Python 3.6.0
3. Target URL
http://dianying.2345.com/list/----2017---2.html
Click through to the next page and observe how the page number in the URL changes from page to page.
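As a quick check of that pattern (a minimal sketch; only the trailing page number changes, matching the urlHead used later in getUrls()):
# Sketch: the list URLs differ only in the trailing page number
urlHead = 'http://dianying.2345.com/list/----2017---'
for page in (1, 2, 3):
    print(urlHead + str(page) + '.html')
# -> http://dianying.2345.com/list/----2017---1.html
#    http://dianying.2345.com/list/----2017---2.html
#    http://dianying.2345.com/list/----2017---3.html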
4. Inspecting with the developer tools
All of the movie entries sit inside the div with {'class':'v_picConBox mt15'}; inside that div there is a ul, and under the ul each movie is a separate li tag.
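As a rough illustration of that structure and how BeautifulSoup walks it, here is a sketch; the sample HTML is made up for illustration, only the class names match what the developer tools show:
from bs4 import BeautifulSoup

# Made-up sample mirroring the structure seen in the developer tools
sample = '''
<div class="v_picConBox mt15">
  <ul class="v_picTxt pic180_240 clearfix">
    <li><span class="sTit">电影A</span></li>
    <li><span class="sTit">电影B</span></li>
  </ul>
</div>
'''
soup = BeautifulSoup(sample, 'lxml')
ul = soup.find('div', attrs={'class': 'v_picConBox mt15'}).find('ul')
print([li.find('span', attrs={'class': 'sTit'}).get_text() for li in ul.find_all('li')])
# -> ['电影A', '电影B']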
(1) First, write a function that fetches the whole page:
def getHTMLText(self,url):
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""
(2) Next, work out the total number of pages; the browser's Inspect Element tool helps locate the pagination block:
def getPages(self):
html = self.getHTMLText(self.urlBase)
soup = BeautifulSoup(html,'lxml')
# tag = soup.find('div',attrs={'class':'v_picConBox mt15'})
tag = soup.find('div',attrs={'class':'v_page'})
subTags = tag.find_all('a')
#get the total page count
return int(subTags[-2].get_text())
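Why subTags[-2]? In the v_page block the last anchor is presumably a "下一页" (next page) link, so the second-to-last anchor holds the highest page number. A minimal sketch on made-up pagination markup (the HTML is an assumption; only the selection logic matters):
from bs4 import BeautifulSoup

# Made-up pagination markup; the real v_page div is assumed to look similar
page_html = '<div class="v_page"><a>1</a><a>2</a><a>3</a><a>...</a><a>23</a><a>下一页</a></div>'
soup = BeautifulSoup(page_html, 'lxml')
subTags = soup.find('div', attrs={'class': 'v_page'}).find_all('a')
print(int(subTags[-2].get_text()))   # -> 23, the total number of pages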
(3) A function that builds the page URLs
pages comes from the getPages() function above.
def getUrls(self,pages):
urlHead = 'http://dianying.2345.com/list/----2017---'
urlEnd = '.html'
#pages comes from getPages() above; produces the numbers 1 through pages (23 at the time)
for i in range(1,pages+1):
url = urlHead + str(i) +urlEnd
self.urls.append(url)
(4) The page-scraping function, which extracts three fields: movieName / movieScore / movieStaring
A small item class is defined beforehand:
class MovieItem(object):
movieName = None
movieScore = None
movieStaring = None
def spider(self,urls):
#urls is the list of 23 assembled page URLs at this point
for url in urls:
#fetch each page in turn
htmlContent = self.getHTMLText(url)
soup = BeautifulSoup(htmlContent,'lxml')
anchorTag = soup.find('ul',attrs={'class':'v_picTxt pic180_240 clearfix'})
tags = anchorTag.find_all('li')
for tag in tags:
item = MovieItem()
item.movieName = tag.find('span',attrs ={'class':'sTit'}).getText()
item.movieScore = tag.find('span',attrs={'class':'pRightBottom'}).em.get_text().replace('分:','')
item.movieStaring = tag.find('span',attrs={'class':'sDes'}).get_text().replace('主演:','')
self.items.append(item)
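For a single li the extraction boils down to three find() calls plus stripping the label text; a sketch on made-up markup (the exact field layout is an assumption based on the selectors above):
from bs4 import BeautifulSoup

# Made-up li mirroring the selectors used in spider()
li_html = '''
<ul>
  <li>
    <span class="sTit">神奇女侠</span>
    <span class="pRightBottom">评<em>分:7.9</em></span>
    <span class="sDes">主演:盖尔·加朵</span>
  </li>
</ul>
'''
tag = BeautifulSoup(li_html, 'lxml').find('li')
name = tag.find('span', attrs={'class': 'sTit'}).get_text()
score = tag.find('span', attrs={'class': 'pRightBottom'}).em.get_text().replace('分:', '')
staring = tag.find('span', attrs={'class': 'sDes'}).get_text().replace('主演:', '')
print(name, score, staring)   # -> 神奇女侠 7.9 盖尔·加朵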
(5) The last function, save()
The codecs module imported earlier comes in here; it lets you choose the encoding used when writing text. Previously, strings had to be converted to UTF-8 by hand before being written to a txt file; here it is enough to open the file with
codecs.open(filename,'w','utf8')
and every string written to that handle is saved as UTF-8 automatically.
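A minimal sketch of the idea (demo.txt is just a throwaway file name); note that on Python 3 the built-in open(filename, 'w', encoding='utf-8') achieves the same thing:
import codecs

# Every string written through this handle is encoded as UTF-8 automatically
with codecs.open('demo.txt', 'w', 'utf-8') as fp:
    fp.write('神奇女侠\t7.9\n')
With that in mind, here is save():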
def save(self,items):
count = 0
fileName = '2017热门电影.txt'.encode('GBK')
#formatting template; two approaches are possible here
tplt = "{0:^10}\t{1:<10}\t{2:^10}"
#use the codecs module imported earlier to control the output encoding
with codecs.open(fileName,'w','utf-8') as fp:
#items is the list that already holds the scraped entries
for item in items:
# fp.write('%s \t %s \t %s \r\n' %(item.movieName,item.movieScore,item.movieStaring))
# tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
fp.write(tplt.format(item.movieName,item.movieScore,item.movieStaring))
count = count + 1
print('\r当前进度:{:.2f}%'.format(count*100/len(items)), end='')
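The commented-out template hints at the second approach: Chinese characters are wider than the default space, so using the full-width space chr(12288) as the fill character keeps the columns aligned. A small sketch of that variant (the sample rows are made up):
# Centre each field in a 10-character column, padding with the full-width space
tplt = "{0:{3}^10}\t{1:{3}^10}\t{2:{3}^10}"
print(tplt.format('神奇女侠', '7.9', '盖尔·加朵', chr(12288)))
print(tplt.format('摔跤吧!爸爸', '8.6', '阿米尔·汗', chr(12288)))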
Complete code
from bs4 import BeautifulSoup
import requests
import codecs
class MovieItem(object):
movieName = None
movieScore = None
movieStaring = None
class GetMovie(object):
def __init__(self):
self.urlBase = 'http://dianying.2345.com/list/----2017--.html'
self.pages = self.getPages()
self.urls = [] #list of assembled page URLs
self.items = []
self.getUrls(self.pages)
self.spider(self.urls)
self.save(self.items)
def getHTMLText(self,url):
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""
def getPages(self):
html = self.getHTMLText(self.urlBase)
soup = BeautifulSoup(html,'lxml')
# tag = soup.find('div',attrs={'class':'v_picConBox mt15'})
tag = soup.find('div',attrs={'class':'v_page'})
subTags = tag.find_all('a')
#get the total page count
return int(subTags[-2].get_text())
def getUrls(self,pages):
urlHead = 'http://dianying.2345.com/list/----2017---'
urlEnd = '.html'
for i in range(1,pages+1):
url = urlHead + str(i) +urlEnd
self.urls.append(url)
def spider(self,urls):
for url in urls:
htmlContent = self.getHTMLText(url)
soup = BeautifulSoup(htmlContent,'lxml')
anchorTag = soup.find('ul',attrs={'class':'v_picTxt pic180_240 clearfix'})
# print(anchorTag)
tags = anchorTag.find_all('li')
for tag in tags:
item = MovieItem()
item.movieName = tag.find('span',attrs ={'class':'sTit'}).getText()
item.movieScore = tag.find('span',attrs={'class':'pRightBottom'}).em.get_text().replace('分:','')
item.movieStaring = tag.find('span',attrs={'class':'sDes'}).get_text().replace('主演:','')
self.items.append(item)
def save(self,items):
count =0
fileName = '2017热门电影.txt'.encode('GBK')
tplt = "{0:^10}\t{1:<10}\t{2:^10}"
with codecs.open(fileName,'w','utf-8') as fp:
for item in items:
# fp.write('%s \t %s \t %s \r\n' %(item.movieName,item.movieScore,item.movieStaring))
# tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
fp.write(tplt.format(item.movieName,item.movieScore,item.movieStaring))
#this part is not handled well; to be fixed later
count = count + 1
print('\r当前进度:{:.2f}%'.format(count*100/len(items)), end='')
if __name__ == '__main__':
GM = GetMovie()
Ad-removal fixed version (now targeting 2018; the advertisement li is dropped in spider(), the output filename is no longer GBK-encoded, and each record gets its own line)
from bs4 import BeautifulSoup
import requests
import codecs
class MovieItem(object):
movieName = None
movieScore = None
movieStaring = None
class GetMovie(object):
def __init__(self):
self.urlBase = 'http://dianying.2345.com/list/----2018--.html'
self.pages = self.getPages()
self.urls = [] #list of assembled page URLs
self.items = []
self.getUrls(self.pages)
self.spider(self.urls)
self.save(self.items)
def getHTMLText(self,url):
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""
def getPages(self):
html = self.getHTMLText(self.urlBase)
soup = BeautifulSoup(html,'lxml')
# tag = soup.find('div',attrs={'class':'v_picConBox mt15'})
tag = soup.find('div',attrs={'class':'v_page'})
subTags = tag.find_all('a')
#get the total page count
# print(subTags)
# print("aaaa",int(subTags[-2].get_text()))
return int(subTags[-2].get_text())
def getUrls(self,pages):
urlHead = 'http://dianying.2345.com/list/----2018---'
urlEnd = '.html'
for i in range(1,pages+1):
url = urlHead + str(i) +urlEnd
self.urls.append(url)
def spider(self,urls):
for url in urls:
htmlContent = self.getHTMLText(url)
soup = BeautifulSoup(htmlContent,'lxml')
anchorTag = soup.find('ul',attrs={'class':'v_picTxt pic180_240 clearfix'})
# print(anchorTag)
tags = anchorTag.find_all('li')
tags.pop(9) #drop the advertisement entry that occupies the tenth li
# print(tags[9])
for tag in tags:
try:
item = MovieItem()
item.movieName = tag.find('span',attrs ={'class':'sTit'}).get_text().strip()
item.movieScore = tag.find('span',attrs={'class':'pRightBottom'}).em.get_text().replace('分:','')
item.movieStaring = tag.find('span',attrs={'class':'sDes'}).get_text().replace('主演:','')
# print(item.movieName,item.movieScore,item.movieStaring)
self.items.append(item)
except Exception as e:
raise e
def save(self,items):
count =0
fileName = '2018热门电影.txt'
tplt = "{0:^10}\t{1:^10}\t{2:^10}"
# tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
# for item in items:
# # print((tplt.format(item.movieName,item.movieScore,item.movieStaring,chr(12288))))
# print((tplt.format(item.movieName,item.movieScore,item.movieStaring)))
with codecs.open(fileName,'w','utf-8') as fp:
for item in items:
fp.write(tplt.format(item.movieName,item.movieScore,item.movieStaring)+'\n')
count = count + 1
print('\r当前进度:{:.2f}%'.format(count*100/len(items)), end='')
if __name__ == '__main__':
GM = GetMovie()
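One caveat about the fix: tags.pop(9) assumes the advertisement always occupies the tenth li. A more defensive variant (a sketch, not part of the original script) is to skip any li that has no sTit span, which drops ad slots wherever they appear:
from bs4 import BeautifulSoup

def parse_movies(html):
    # Sketch: parse one list page, skipping ad slots instead of relying on pop(9)
    soup = BeautifulSoup(html, 'lxml')
    ul = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
    movies = []
    for tag in ul.find_all('li'):
        title_span = tag.find('span', attrs={'class': 'sTit'})
        if title_span is None:
            continue   # advertisement or other non-movie slot
        movies.append({
            'movieName': title_span.get_text().strip(),
            'movieScore': tag.find('span', attrs={'class': 'pRightBottom'}).em.get_text().replace('分:', ''),
            'movieStaring': tag.find('span', attrs={'class': 'sDes'}).get_text().replace('主演:', ''),
        })
    return movies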