Scraping download links from 电影天堂 (dytt8.net) with BS4 and saving them to a CSV file (Part 1)

Changes:
(1) Fixed the garbled characters (mojibake) in the results;
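
For context, the garbling came from requests decoding the site's GB2312 pages with the ISO-8859-1 charset it guesses from the HTTP headers. A minimal sketch of diagnosing and fixing that, using the same listing page as the script below:

import requests

resp = requests.get('https://www.dytt8.net/html/gndy/dyzz/index.html',
                    headers={'User-Agent': 'Mozilla/5.0'})
print(resp.encoding)            # ISO-8859-1: guessed from the HTTP headers
print(resp.apparent_encoding)   # GB2312: detected from the page bytes
resp.encoding = resp.apparent_encoding  # decode .text with the detected charset
print(resp.text[:200])          # Chinese text now renders without mojibake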

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import time
import csv


def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    page = requests.get(url, headers=headers)
    page.encoding = "gb2312"  # key point: override the guessed charset with the site's real one
    html = page.text          # key point: .text is decoded using page.encoding
    #print(html)

    soup = BeautifulSoup(html, 'html.parser')
    for li in soup.find_all('div', class_="co_content8"):
        for url_info in li.find_all('a', class_="ulink"):
            moviename = url_info.get_text()
            url_1 = 'http://www.dytt8.net' + url_info['href']
            #print(moviename)
            #print(url_1)
            # detail page
            req2 = requests.get(url_1, headers=headers)
            #print(req2.encoding)           # ISO-8859-1: the charset requests guesses from the headers
            #print(req2.apparent_encoding)  # GB2312: the charset detected from the page body
            req2.encoding = "gb2312"
            #content2 = req2.content  # note the difference: .content is raw bytes, .text is a decoded str
            content2 = req2.text
            soup2 = BeautifulSoup(content2, 'html.parser')  # new name, so the listing-page soup is not shadowed
            for td in soup2.find_all('td', attrs={'style': 'WORD-WRAP: break-word'}):
                for url_2 in td.find_all('a'):
                    url_3 = url_2.string  # already a decoded str (req2.encoding handled the charset); no per-string re-encoding needed
                    #print(url_3)
                    item = {  # collect the scraped fields into a dict
                        "moviename": moviename,
                        "movielink": url_1,
                        "ftplink": url_3
                    }
                    print(item)
                    save_result(item)  # persist each record as soon as it is scraped
                    item.clear()  # clear the dict before the next record (optional: a fresh dict is built each loop anyway)
# persistence
def save_result(item):
    # save to TXT (alternative):
    #with open('result.txt', 'a') as f:
    #    f.write(json.dumps(item) + '\n')   # would also require `import json`
    # save to csv; no header row is written here (see the sketch below for a variant that writes one)
    with open('dy.csv', 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['moviename', 'movielink', 'ftplink']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(item)
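
# Optional sketch: write the header row once and add a UTF-8 BOM so Excel
# recognizes the encoding. The name save_result_with_header and the os.path
# check are illustrative assumptions, kept inactive like the variants above.
'''
import os

def save_result_with_header(item):
    new_file = not os.path.exists('dy.csv') or os.path.getsize('dy.csv') == 0
    # 'utf-8-sig' emits a BOM on the first write, so use it only when creating
    # the file; appending with plain 'utf-8' avoids stray BOMs mid-file
    with open('dy.csv', 'a', newline='',
              encoding='utf-8-sig' if new_file else 'utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['moviename', 'movielink', 'ftplink'])
        if new_file:
            writer.writeheader()
        writer.writerow(item)
'''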
def main():
    #url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
    #getHtml(url)

    # pagination, approach 1: an explicit list of page URLs
    '''urls = ['https://www.dytt8.net/html/gndy/dyzz/list_23_1.html',
            'https://www.dytt8.net/html/gndy/dyzz/list_23_2.html'
            ]
    for url in urls:
        getHtml(url)
        time.sleep(2)'''

    # pagination, approach 2: build each page URL from a counter
    for i in range(1, 4):
        print('Fetching page ' + str(i))
        url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_' + str(i) + '.html'
        getHtml(url)
        time.sleep(3)  # pause between pages to avoid hammering the server

if __name__ == '__main__':
    main()
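
After a run, a quick sanity check is to read dy.csv back with the same csv module (a minimal sketch, assuming the file produced above):

import csv

with open('dy.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)  # each row: [moviename, movielink, ftplink]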
