网址:电影港
爬取内容:爬取电影港网站中的动漫信息
import requests
import re
from bs4 import BeautifulSoup
# Request headers sent with every page fetch. `h` is kept as an alias because
# the original script bound both names to the same dict.
hd = h = {'user-agent': 'Mozilla/5.0'}
# Index page (page 1) of the anime section.
url = 'http://www.dygangs.com/dmq/'
# Divisor the original script used to turn the record count into a page
# count; presumably the number of entries per listing page -- TODO confirm.
PER_PAGE = 10


def init(url, hd, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        hd: Dict of HTTP request headers.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx response status.
    """
    r = requests.get(url, headers=hd, timeout=timeout)
    # Must be *called*: the bare attribute access in the original never
    # raised, so error pages were parsed as if they were valid listings.
    r.raise_for_status()
    # Let requests guess the charset from the body before decoding the text.
    r.encoding = r.apparent_encoding
    return BeautifulSoup(r.text, 'html.parser')


def getInfo(soup):
    """Print the name, cover image URL and play-page link of each entry.

    Args:
        soup: Parsed listing page; only its ``find_all`` method is used.

    Raises:
        IndexError: If the page contains fewer than six ``<table>`` elements.
        KeyError: If an image lacks ``src`` or a link lacks ``href``.
    """
    # The listing is read from the sixth <table> of the page.
    table = soup.find_all('table')[5]
    links = table.find_all('a', {'class': 'classlinkclass'})  # title links
    covers = table.find_all('img')  # cover images
    # zip() stops at the shorter sequence, so a link without a matching
    # image no longer aborts the output with an IndexError.
    for link, cover in zip(links, covers):
        print("动漫名称:", link.string)
        print("动漫封面:", cover['src'])
        print("播放页面:", link['href'])
        print("-*-" * 20)


def total_pages(total_records, per_page=PER_PAGE):
    """Return how many pages are needed to show *total_records* entries.

    Rounds up, so a partially filled last page is counted (25 records at
    10 per page give 3 pages, not 2).
    """
    return (total_records + per_page - 1) // per_page


def main():
    """Ask the user for a page number and print that page's entries.

    Page 1 is the index page itself; page N (N > 1) is ``index_N.htm`` below
    the base URL. Invalid or out-of-range input prints a page-number error;
    fetch or parse failures print a failure message on stdout and the reason
    on stderr.
    """
    import sys

    try:
        soup = init(url, hd)
        # The total record count is the text of the link titled
        # "Total record".
        total_link = soup.find('a', {'title': 'Total record'})
        if total_link is None:
            raise ValueError("'Total record' link not found on index page")
        page_num = total_pages(int(total_link.text))

        # User input
        try:
            num = int(input("输入你要爬取那一页的信息:"))
        except ValueError:
            # Non-numeric input is a bad page number, not a crawl failure.
            print("输入的页码数不正确")
            return

        if num == 1:
            getInfo(soup)
        elif 1 < num <= page_num:
            p_url = "{}index_{}.htm".format(url, num)
            getInfo(init(p_url, hd))
        else:
            print("输入的页码数不正确")
    except (requests.RequestException, LookupError, ValueError) as exc:
        # Only expected network/parse failures are caught, so Ctrl-C and
        # genuine programming errors are no longer silently swallowed.
        print("爬取失败")
        print(exc, file=sys.stderr)


if __name__ == "__main__":
    main()
爬虫 —— 爬取网络小说,详细分析及代码
传送门