Step05:爬虫小项目,爬取最新电影迅雷下载地址

1.简述

电影天堂的广告实在令人不胜其烦,但其视频资源的确有可取之处。因此,趁着学习爬虫技术的这段时间,简单实现了一个完整的小项目。
完整代码——链接

2.技术准备

IDE:PyCharm,Python 3.6.5。使用 requests + re 从电影天堂爬取最新电影资源的下载地址,使用 tkinter 设计简单的界面;其中还用到了多线程技术,Python 的 threading 库对多线程提供了支持,简化了许多工作。

3.项目步骤

进入Pycharm建立project,实现以下目录结构:
目录结构
\ThunderAndSpider\message_spider\spider_config.py

# Literal values shared by both header sets below (private: not exported by
# ``from spider_config import *``).
_HOST = 'www.dytt8.net'
_ACCEPT = (
    'text/html,application/xhtml+xml,application/xml;q=0.9,'
    'image/webp,image/apng,*/*;q=0.8'
)
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 '
    'Core/1.63.6726.400 QQBrowser/10.2.2265.400'
)

# Request headers for the site's index page.
headers = {
    'Cookie': (
        '37cs_user=37cs63629906334; XLA_CI=3e976860bea5549a9a73e10df8153fcd; '
        '37cs_pidx=2; 37cs_show=253%2C75; cscpvrich5041_fidx=3'
    ),
    'Host': _HOST,
    'Accept': _ACCEPT,
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': _USER_AGENT,
}

# Request headers for the individual film pages (sent with a Referer
# instead of a Cookie).
other_headers = {
    'User-Agent': _USER_AGENT,
    'Referer': 'http://www.dytt8.net/',
    'Host': _HOST,
    'Accept': _ACCEPT,
}

# Root URL of the site being scraped.
url_dytt = "http://www.dytt8.net"

# Index page: captures (relative film-page href, film title).
re_strIndex = r'>最新电影下载</a>]<a\shref=\'(.*?)\'>(.*?)</a><br'
# Film page: captures the text of the download anchor inside the WORD-WRAP cell.
re_strLink = r'<td\sstyle="WORD-WRAP.*?<a\shref=".*?">(.*?)</a></td>'

\ThunderAndSpider\message_spider\dytt_spider.py(简单实现的爬虫类)

import requests
import re

from message_spider.spider_config import *
from requests.exceptions import RequestException

class dytt_spider:
    """Scrape film titles and download links from the dytt8 site.

    The index page is fetched first; every film link found there is then
    fetched in turn and the download links on that page are extracted with
    regular expressions.
    """

    def __init__(self, url=None):
        """Create a spider.

        :param url: site root to scrape; defaults to the configured
            ``url_dytt`` when omitted.
        """
        self.url = url_dytt if url is None else url

    def get_html(self, url, headers, timeout=10):
        """Fetch *url* and return the decoded page text.

        :param url: page address.
        :param headers: HTTP headers to send with the request.
        :param timeout: seconds to wait for the server; without a timeout
            ``requests`` may block indefinitely on a stalled connection.
        :return: page text, or ``None`` on a request error or non-200 status.
        """
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                return None
            # Avoid garbled Chinese text: decode with the encoding detected
            # from the body rather than the one guessed from the headers.
            response.encoding = response.apparent_encoding
            return response.text
        except RequestException:
            return None

    def _get_re_findall_items(self, html, re_str):
        """Return all matches of *re_str* in *html* (DOTALL mode).

        A missing page (``None`` or empty string) yields an empty list
        instead of raising ``TypeError`` inside ``re``.
        """
        if not html:
            return []
        return re.compile(re_str, re.S).findall(html)

    def get_index(self, html, re_str):
        """Yield ``{"name", "url"}`` dicts for each film link on the index page.

        *re_str* must capture (relative href, title); the href is appended
        to the spider's site root.
        """
        for item in self._get_re_findall_items(html, re_str):
            yield {
                "name": item[1],
                "url": self.url + item[0]
            }

    def get_thunder_link(self, html, re_str):
        """Yield ``{"thunder": link}`` for each download link found in *html*."""
        for item in self._get_re_findall_items(html, re_str):
            yield {
                "thunder": item
            }

    def get_all_thunderlink(self):
        """Yield one dict per download link across all films on the index page.

        Each dict has the keys ``"影片名:"`` (film title) and ``"磁力链接:"``
        (download link). Nothing is yielded when the index page cannot be
        fetched; film pages that fail to load are skipped.
        """
        index = self.get_html(self.url, headers)
        if not index:
            return
        for item in self.get_index(index, re_strIndex):
            html = self.get_html(item['url'], other_headers)
            if not html:
                continue
            for x in self.get_thunder_link(html, re_strLink):
                yield {
                    "影片名:": item["name"],
                    "磁力链接:": x['thunder'],
                }

if __name__ == "__main__":
    # Manual smoke test: print every scraped title/link record.
    # The spider reads its start URL from the configuration, so the
    # constructor is called without arguments (passing url_dytt raised
    # TypeError against an __init__ that takes none).
    dytt = dytt_spider()
    for mess in dytt.get_all_thunderlink():
        print(mess)

\ThunderAndSpider\thunder\thunder_config.py

# NOTE: change this to the full path of the Thunder executable on your machine.
# Raw string: '\T' is not a valid escape sequence, and a non-raw literal
# triggers an invalid-escape warning on current Python versions.
thunder_path = r'E:\Thunder.exe'
# NOTE: change this to the directory where Thunder stores downloaded files.
save_path = 'G:\\thunder_download\\'

\ThunderAndSpider\thunder\dytt_thunder.py

import os, time
#import threading
from thunder.thunder_config import *

class my_thunder:
    """Hand one download URL to the locally installed Thunder client.

    Attributes:
        url: the download address passed to the client.
        filename: last path component of ``url``.
        args: the equivalent command line as a single string (kept for
            backward compatibility; it is no longer executed by a shell).
    """

    def __init__(self, url):
        self.url = url
        self.filename = os.path.split(self.url)[1]
        self.args = r'"{thunder_path}" {url}'.format(thunder_path=thunder_path, url=url)
        # Argument vector actually executed. The URL comes from scraped HTML,
        # so it is passed as a separate argument and never parsed by a shell.
        self._command = [thunder_path, url]

    def start_target(self):
        """Launch the client with the URL and wait for the command to return.

        A launch failure (for example a wrong ``thunder_path``) is reported
        on stdout instead of raising.
        """
        import subprocess

        print("准备下载---{name}".format(name=self.filename))
        try:
            subprocess.call(self._command)
        except OSError as err:
            print("无法启动迅雷: {err}".format(err=err))

    def check_start(self):
        """Return True if ``<filename>.xltd`` exists in ``save_path``.

        NOTE(review): ``.xltd`` is presumably Thunder's in-progress temp
        file -- confirm against the client version in use.
        """
        check_file = self.filename + ".xltd"
        return os.path.exists(os.path.join(save_path, check_file))

    def check_end(self):
        """Return True if a file named ``filename`` exists in ``save_path``."""
        return os.path.exists(os.path.join(save_path, self.filename))

\ThunderAndSpider\win_gui\main_gui.py(此处设计界面)

from tkinter import *

class MainGUI:
    """Fixed-size main window: source entry, link list, current link, three buttons.

    Widgets are exposed as attributes (``entry_01``, ``text_01``,
    ``entry_02``, ``btn_01``..``btn_03``) so the caller can fill them and
    attach button commands.
    """

    def __init__(self):
        window = Tk()
        window.title("电影下载")
        window.geometry("700x500")
        window.resizable(False, False)
        self.root = window
        self._set_gui()

    def open_gui(self):
        """Enter the Tk main loop."""
        self.root.mainloop()

    def _add_row(self, row, caption, widget_cls, **options):
        """Put a caption label in column 0 and a new widget in column 1 of *row*."""
        Label(self.root, text=caption).grid(row=row, column=0)
        widget = widget_cls(self.root, **options)
        widget.grid(row=row, column=1, sticky=W)
        return widget

    def _set_gui(self):
        """Create and lay out all widgets on a grid."""
        self.entry_01 = self._add_row(0, "资源来源:", Entry)
        self.text_01 = self._add_row(1, "资源种子:", Text)
        self.entry_02 = self._add_row(2, "当前电影:", Entry, width=300)

        # The buttons share one frame so they sit side by side under the entry.
        self.frm = Frame(self.root)
        self.frm.grid(row=3, column=1, sticky=W)
        button_specs = (
            ("btn_01", "上一部"),
            ("btn_02", "下一部"),
            ("btn_03", "下载当前部"),
        )
        for column, (attr_name, caption) in enumerate(button_specs, start=1):
            button = Button(self.frm, text=caption)
            setattr(self, attr_name, button)
            button.grid(row=0, column=column)

\ThunderAndSpider\main.py

from win_gui.main_gui import *
from thunder.dytt_thunder import *
from message_spider.dytt_spider import *
from message_spider.spider_config import url_dytt

import threading

# Download links collected from the spider, in the order they were yielded.
urls = []
# Text for the GUI's link list: title / link pairs appended by
# get_urls_and_linkmessage.
link_message = ""
# Index into urls of the link currently shown in the "current film" entry.
urls_index = 0

def get_urls_and_linkmessage(spider):
    """Drain *spider* into the module-level ``urls`` and ``link_message``.

    For each record yielded by ``spider.get_all_thunderlink()`` the title
    and link are appended to ``link_message`` (one per line, followed by a
    blank line) and the link is appended to ``urls``.
    """
    global urls, link_message
    for record in spider.get_all_thunderlink():
        link_message += "".join((record["影片名:"], "\n", record["磁力链接:"], "\n\n"))
        urls.append(record['磁力链接:'])

def change_entry_a(entry):
    """Show the previous link in *entry*, wrapping from the first to the last.

    Does nothing when no links have been collected, so an empty ``urls``
    no longer raises ``IndexError``.
    """
    global urls_index
    if not urls:
        return
    # Modular arithmetic handles the wrap-around and keeps the index in range.
    urls_index = (urls_index - 1) % len(urls)
    entry.delete(0, END)
    entry.insert(0, urls[urls_index])

def change_entry_b(entry):
    """Show the next link in *entry*, wrapping from the last to the first.

    Does nothing when no links have been collected, so an empty ``urls``
    no longer raises ``IndexError``.
    """
    global urls_index
    if not urls:
        return
    # Modular arithmetic handles the wrap-around and keeps the index in range.
    urls_index = (urls_index + 1) % len(urls)
    entry.delete(0, END)
    entry.insert(0, urls[urls_index])

def download_current():
    """Start downloading the currently selected link on a background thread.

    The client is launched from a separate thread so the GUI callback
    returns immediately. Does nothing when no links have been collected.
    """
    if not urls:
        return
    thunder = my_thunder(urls[urls_index])
    new_thread = threading.Thread(target=thunder.start_target)
    new_thread.start()

def mainGUI_config(mainWin):
    """Fill the window with the scraped data and wire up the three buttons.

    When no links were collected the "current film" entry is left empty and
    the buttons are disabled, instead of crashing on ``urls[0]``.
    """
    # The END index inserts at the end (INSERT would insert at the cursor).
    mainWin.entry_01.insert(END, url_dytt)
    # Text widget indices are "line.column" strings.
    mainWin.text_01.insert("1.0", link_message)
    mainWin.btn_01.config(command=lambda: change_entry_a(mainWin.entry_02))
    mainWin.btn_02.config(command=lambda: change_entry_b(mainWin.entry_02))
    mainWin.btn_03.config(command=download_current)
    if urls:
        mainWin.entry_02.insert(END, urls[0])
    else:
        # Nothing to browse or download: make the buttons inert.
        for button in (mainWin.btn_01, mainWin.btn_02, mainWin.btn_03):
            button.config(state="disabled")

def main():
    """Program entry point: build the window, scrape the links, then run the GUI.

    Scraping happens before the main loop starts, so the window only
    becomes interactive once all pages have been fetched.
    """
    print("++++主程序启动++++")
    window = MainGUI()
    get_urls_and_linkmessage(dytt_spider())
    mainGUI_config(window)
    window.open_gui()


if __name__ == "__main__":
    main()

4.项目成果

目标站点:
红框中为要抓取的信息
项目主界面
项目主界面

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

指尖码动

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值