#!/usr/bin/python3
# coding=utf8
import requests
from bs4 import BeautifulSoup
import pymysql
import time
'''
需求:某视频网站,没有搜索功能,我弄个python爬虫爬取网站视频名称和磁力链接,全部爬取下来放到mysql数据库中,就可以按自己喜好搜索关键字获得影片下载地址进行下载了
作者:xiaoxiaohui
时间:2019-10-03
其他:mysql数据库创建数据库和数据表
mysql -uroot -pxxh123
create database 4hucom;
use 4hucom;
数据库id自增长
CREATE TABLE `4hu_shoujixiaoshipin` (`id` INT(11) not null auto_increment,`biaoti` VARCHAR(380), `fabutime` VARCHAR(380), `lianjie` VARCHAR(380),primary key(id) );
其他2:因为是通过之前一些爬虫代码快速改进,所以关于(1)关于方法名称get_house_info都是沿用之前爬取租房网站的名称啦(2)info字典里面这个'播放地址':fabutime,其实'播放地址'改为bofangdizhi比较好
'''
def get_links(url):
    """Fetch one listing page and return the absolute URL of every video entry on it."""
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Each video entry on the listing page is one of these <li> cells.
    entries = soup.find_all('li', class_="col-md-2 col-sm-3 col-xs-4")
    base = 'http://www.网站名马赛克.com'
    result = []
    for entry in entries:
        result.append(base + entry.a.get('href'))
    return result
def get_house_info(item_url):
    """Scrape one video detail page.

    Returns a dict with the video title ('影片名字'), the absolute playback
    page URL ('播放地址') and the resolved magnet/download link ('下载链接').
    """
    response = requests.get(item_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # The page carries two 'playul' lists: index 0 is the playback list,
    # index 1 is the download list.
    play_lists = soup.find_all('ul', class_="playul")
    download_page = 'http://www.网站名马赛克.com' + play_lists[1].li.a.get('href')
    lianjie = get_cililianjie(download_page)
    print(lianjie)
    title_divs = soup.find_all('div', class_="detail-title fn-clear")
    biaoti = title_divs[0].text.strip()  # video title, surrounding whitespace removed
    fabutime = 'http://www.网站名马赛克.com' + play_lists[0].li.a.get('href')  # playback page URL
    # NOTE: the original dict also had 'id': id, which mistakenly stored the
    # builtin id() function; no caller used that key, so it is dropped.
    info = {
        '影片名字': biaoti,
        '播放地址': fabutime,
        '下载链接': lianjie,
    }
    return info
def get_cililianjie(url):
    """Open a download page and return the magnet link inside its 'download' div."""
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    parsed = BeautifulSoup(resp.text, 'html.parser')
    download_divs = parsed.find_all('div', class_="download")
    # First anchor of the first download div holds the magnet link.
    return download_divs[0].a.get('href')
def get_db(setting):
    """Open and return a pymysql connection built from the given settings dict."""
    connection = pymysql.connect(**setting)
    return connection
def insert(db, house):
    """Insert one scraped record into the 4hu_shoujixiaoshipin table.

    house must carry the keys '影片名字', '播放地址' and '下载链接'.
    Commits the transaction on success.
    """
    # Parameterized query: the original interpolated the values straight into
    # the SQL string, which broke on titles containing quotes and was open to
    # SQL injection. The driver now does the quoting.
    sql = ('insert into 4hu_shoujixiaoshipin (biaoti,fabutime,lianjie) '
           'values(%s,%s,%s)')
    cursor = db.cursor()
    try:
        cursor.execute(sql, (house['影片名字'], house['播放地址'], house['下载链接']))
        db.commit()
    finally:
        cursor.close()  # the original never closed the cursor
DATABASE = {
    'host': '127.0.0.1',
    'database': '4hucom',
    'user': 'root',
    'password': 'xxh123',
    # utf8 (not utf8mb4): with utf8mb4 the stored rows showed up garbled in
    # navicat; utf8 displayed the Chinese text correctly.
    'charset': 'utf8',
}

db = get_db(DATABASE)  # open the MySQL connection once for the whole crawl
try:
    # Walk every listing page; page 1 uses a different URL pattern than 2..43.
    for yema in range(1, 44):
        if yema == 1:
            url = 'https://www.网站名马赛克.com/vod/html7/index.html'
        else:
            url = 'https://www.网站名马赛克.com/vod/html7/index_' + str(yema) + '.html'
        links = get_links(url)
        for item_url in links:
            time.sleep(1.0)  # throttle: one detail request per second
            house = get_house_info(item_url)
            print('获取一条成功:{}'.format(house['影片名字']))
            insert(db, house)  # store the scraped record in MySQL
finally:
    db.close()  # the original leaked the connection on exit