环境:
python2.7
Windows10
使用软件:ECLIPSE
闲来无聊,很巧朋友给发福利了,便爬之。
用到的库文件
import urllib2 (python3中为urllib)
import re
主要由网页源码获取函数,文件读写函数,视频链接抓取,视频下载几个部分组成
def GetData(url): hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive'} #头部 req = urllib2.Request(url, headers=hdr) try: page = urllib2.urlopen(req) except urllib2.HTTPError, e: print e.fp.read() #异常处理 Data = page.read() #源代码下载 return Data
#读文件 def Read_File(file_name): with open(file_name,'r') as f: list1=[] for line1 in f: list1.append(line1) f.close() return list1 #写出视频链接 def SaveData(file_name, file_content): with open(file_name.replace('/', '_') + ".txt", "ab") as f: f.write(file_content) f.close() #下载视频文件 def SaveVideo(file_name, file_content): with open(file_name.replace('/', '_') + ".mp4", "ab") as f: f.write(file_content) f.close()
def PrintData(): Head_Agreement='https' Domain='porn-video7.com' File_Name='page' Suffix='.html' #网址分割 for Temp in range(1,5): #获取四个网页 Url_Link=Head_Agreement+'://'+Domain+'/'+File_Name+str(Temp)+Suffix Data = GetData(Url_Link) SaveData('Video_LinkTmp', Data+'\r\n') list1=Read_File('Video_LinkTmp.txt') #获取网页源代码并写出 for Tmp in list1: match = re.compile(r'<img data-mb="shuffle-thumbs" data-opt-timeout="500" data-opt-limit="10" src="(.*).mp4/',re.S) #正则匹配抓出视频链接 rs = match.search(Tmp) if rs: Video_Tmp=rs.group(1).strip()+'.mp4' match1 = re.compile(r'(.*)thumbs(.*)',re.S) rs1 = match1.search(Video_Tmp) if rs1: Temp=rs1.group(1).strip()+'videos'+rs1.group(2).strip() SaveData('Video_Link', Temp+'\r\n') #写出视频链接 return
#视频下载函数 def Download_Video(): list_video=Read_File('Video_Link.txt') Text_Name=1 for Temp_Video in list_video: match2 = re.compile(r'(.*).mp4',re.S) rs2 = match2.search(Temp_Video) if rs2: Url_Video=rs2.group(1).strip()+'.mp4' Adult_Video = GetData(Url_Video) SaveVideo(str(Text_Name), Adult_Video) print(str(Text_Name)+' video Download Working Done!\n') Text_Name+=1
PrintData() print('Video Link Working Done!\n') Download_Video() print('Video Download Working Done!\n')
下面是完整代码
import urllib2 import re def GetData(url): hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive'} req = urllib2.Request(url, headers=hdr) try: page = urllib2.urlopen(req) except urllib2.HTTPError, e: print e.fp.read() Data = page.read() return Data def Read_File(file_name): with open(file_name,'r') as f: list1=[] for line1 in f: list1.append(line1) f.close() return list1 def SaveData(file_name, file_content): with open(file_name.replace('/', '_') + ".txt", "ab") as f: f.write(file_content) f.close() def SaveVideo(file_name, file_content): with open(file_name.replace('/', '_') + ".mp4", "ab") as f: f.write(file_content) f.close() def PrintData(): Head_Agreement='https' Domain='porn-video7.com' File_Name='page' Suffix='.html' for Temp in range(1,5): Url_Link=Head_Agreement+'://'+Domain+'/'+File_Name+str(Temp)+Suffix Data = GetData(Url_Link) SaveData('Video_LinkTmp', Data+'\r\n') list1=Read_File('Video_LinkTmp.txt') for Tmp in list1: match = re.compile(r'<img data-mb="shuffle-thumbs" data-opt-timeout="500" data-opt-limit="10" src="(.*).mp4/',re.S) rs = match.search(Tmp) if rs: Video_Tmp=rs.group(1).strip()+'.mp4' match1 = re.compile(r'(.*)thumbs(.*)',re.S) rs1 = match1.search(Video_Tmp) if rs1: Temp=rs1.group(1).strip()+'videos'+rs1.group(2).strip() SaveData('Video_Link', Temp+'\r\n') return def Download_Video(): list_video=Read_File('Video_Link.txt') Text_Name=1 for Temp_Video in list_video: match2 = re.compile(r'(.*).mp4',re.S) rs2 = match2.search(Temp_Video) if rs2: Url_Video=rs2.group(1).strip()+'.mp4' Adult_Video = GetData(Url_Video) SaveVideo(str(Text_Name), Adult_Video) print(str(Text_Name)+' video Download Working Done!\n') Text_Name+=1 PrintData() print('Video Link Working Done!\n') Download_Video() print('Video Download Working Done!\n')
运行效果