爬取电影天堂全站电影

最新推荐文章于 2021-09-26 10:53:58 发布

wywwzjj

最新推荐文章于 2021-09-26 10:53:58 发布

阅读量1.8w

点赞数

分类专栏： Python

本文链接：https://blog.csdn.net/weixin_42348709/article/details/84594534

版权

新博客：https://wywwzjj.top/

具体分析以后再补，静态页面也没啥好分析的。

import requests, re
from bs4 import BeautifulSoup
import xlsxwriter
import datetime
from lxml import etree

def get_URLs(URL, page):
    """Collect the detail-page links listed on one page of a category.

    The listing address is built as ``URL + page + '.html'``, fetched with
    ``get_html`` and parsed with BeautifulSoup. Every element carrying the
    ``ulink`` CSS class contributes its ``href``; links containing
    ``'index'`` are dropped and the rest are resolved against the site
    domain.

    Args:
        URL: Prefix of the listing address, up to (not including) the page
            number.
        page: Page number. Converted with ``str()``, so both ``'3'`` and
            ``3`` are accepted.

    Returns:
        A list of absolute URL strings, in document order. Empty when the
        fetch produced no markup.
    """
    from urllib.parse import urljoin

    domain = 'http://www.ygdy8.net'
    html = get_html(URL + str(page) + '.html')
    # NOTE(review): get_html's failure value is not visible from here;
    # treating a falsy result as "no links" avoids feeding None to the parser.
    if not html:
        return []

    soup = BeautifulSoup(html, 'lxml')
    # Read the href attribute from each tag directly instead of running a
    # regex over the serialized markup: the regex only worked when href was
    # the last attribute of the tag.
    hrefs = (tag.get('href') for tag in soup.find_all(class_='ulink'))

    # Filter into a new list. The previous version popped entries from the
    # list while iterating over it, which skips the element following each
    # removed one.
    # NOTE(review): the original author remarked that two entries went
    # missing per page; the in-place pop may have been the cause - confirm
    # against a live page.
    return [
        urljoin(domain, href)
        for href in hrefs
        if href and 'index' not in href
    ]

def get_html(url):
     try:
         headers = {
   
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
             'ContentType':