I wrote a small Python crawler that scrapes the titles, release dates, and download links of the few dozen movies listed on the 电影天堂 (dytt8.net) homepage, using the BeautifulSoup and lxml libraries for parsing.
The code is as follows:
import requests
from urllib.parse import urljoin
from lxml import etree
from bs4 import BeautifulSoup
from pandas import DataFrame
class dytt:
    def __init__(self, url, header):
        # Base URL and request headers
        self.url = url
        self.header = header
        # URLs of the per-movie detail pages
        self.url2 = []
        # Movie titles, release dates, and download links
        self.name = []
        self.date = []
        self.download = []

    # Fetch the homepage source (where the movie titles live)
    def context1(self):
        res = requests.get(self.url, headers=self.header)
        res.encoding = 'gb2312'  # the site serves GB-encoded pages
        return res.text

    # Fetch a detail page source (where the release date and download link live)
    def context2(self, url):
        res = requests.get(urljoin(self.url, url), headers=self.header)
        res.encoding = 'gb2312'
        return res.text
    def nameMain(self):
        bs = BeautifulSoup(self.context1(), 'html.parser')
        search = bs.find("div", attrs={"class": "bd3"})
        search = search.find_all("div", attrs={"class": "bd3r"})[0].div.div.select('div[class="co_area2"]')
        for tag in search:
            finds = tag.find_all("div", class_="co_content8")
            if finds:
                finds = finds[0].ul.table.select("tr")
                for f in finds:
                    try:
                        # The title sits between 《 and 》 in the link text
                        moviename = f.td.select('a')[1].get_text()
                        moviename = moviename.split("《")[1].split("》")[0]
                        self.name.append(moviename)
                        # Collect the detail-page URL for the second pass
                        href = f.td.select('a')[1].get('href')
                        self.url2.append(href)
                    except (AttributeError, IndexError):
                        # Skip rows that don't have the expected structure
                        continue
    # Scrape the release date and download link from each detail page
    def download_date(self):
        for i, url in enumerate(self.url2, start=1):
            et = etree.HTML(self.context2(url))
            # Release date: join all text inside the #Zoom div, then take
            # what sits between "上映日期" and the next "◎" section marker
            dt = et.xpath('//div[@class="bd3l"]/div[@class="co_area2"]/div[@class="co_content8"]/ul')
            dt = dt[0].xpath('.//div[@id="Zoom"]')
            dt = dt[0].xpath('.//text()')
            dt = "".join(dt).split("上映日期")[1].split("◎")[0].strip()
            self.date.append(dt)
            # Download link: the first <a> inside the table in the #Zoom div
            dl = et.xpath('//div[@class="bd3l"]/div[@class="co_area2"]/div[@class="co_content8"]/ul')
            dl = dl[0].xpath('.//div[@id="Zoom"]')
            dl = dl[0].xpath('.//table')
            dl = dl[0].xpath('.//tr/td/a/text()')
            self.download.append(dl[0])
            print("Scraped movie " + str(i))
    # Save the three lists as an Excel file
    def save_excel(self):
        print("Saving to an Excel file...")
        df = DataFrame([self.name, self.date, self.download]).T
        df.columns = ["电影名称", "上映日期", "下载链接"]  # keep the Chinese column headers in the output
        # to_excel writes .xlsx directly, so no encoding argument is needed
        # (and recent pandas versions no longer accept one)
        df.to_excel("movie.xlsx", index=False)
        print("Saved!")

if __name__ == '__main__':
    url = "https://www.dytt8.net/"
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    dy = dytt(url, header)
    dy.nameMain()
    dy.download_date()
    dy.save_excel()
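One caveat: the split-based date extraction in download_date assumes every detail page contains "上映日期" followed by a "◎" marker; if either is missing it raises an IndexError and kills the run. A slightly more defensive variant (just a sketch against the same page layout; extract_date is a hypothetical helper, not part of the script above) could use a regex with a fallback:

import re
from lxml import etree

def extract_date(html):
    # Hypothetical helper: pull the release date out of a detail page.
    text = "".join(etree.HTML(html).xpath('//div[@id="Zoom"]//text()'))
    # "上映日期" is followed by the date, up to the next "◎" section marker
    m = re.search(r'上映日期\s*(.*?)\s*◎', text, re.S)
    return m.group(1).strip() if m else ""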
The output of a run looks like this:
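To double-check the saved spreadsheet, it can be read back with pandas (a quick usage sketch; reading .xlsx files requires the openpyxl package):

import pandas as pd

df = pd.read_excel("movie.xlsx")
print(df.head())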