Notes on My First Completed Web Scraper Project

After a lot of stumbling, I finally finished this scraper with my teacher's help. Recording the code here.

import os
import logging
import requests
from bs4 import BeautifulSoup
import re

logging.basicConfig(level=logging.INFO)


def store_star_img(star_url, store_star_dir):
    # Download the single image shown on the page at star_url into store_star_dir.
    star_text = requests.get(star_url).text
    soup = BeautifulSoup(star_text, 'lxml')
    star_src = soup.select('.content-pic img')[0]['src']
    file_name = star_src.split('/')[-1]
    file_name = os.path.join(store_star_dir, file_name)
    logging.info(file_name)
    # The image host appears to check the Referer header (hotlink
    # protection), so pass the gallery page URL along with a browser-like
    # User-Agent.
    headers = {
        'Referer': star_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    with open(file_name, 'wb') as f:
        star_content = requests.get(star_src, headers=headers).content
        f.write(star_content)



def store_page_star(href, store_star_dir):
    # Download every image page under href into store_star_dir.
    url = 'http://www.mm131.com/mingxing'
    star_text = requests.get(href).text
    soup = BeautifulSoup(star_text, 'lxml')
    star_url = href
    # The .content-page div holds the pagination links for the rest of
    # this star's gallery.
    picture_urls = soup.find('div', class_='content-page').find_all('a')
    for p_url in picture_urls:
        p_href = p_url['href']
        # Download the current page first, then move on to the page this
        # pagination link points to.
        store_star_img(star_url, store_star_dir)
        star_url = f'{url}/{p_href}'


def main(url):
    store_dir = 'mingxing'
    os.makedirs(store_dir, exist_ok=True)
    # requests falls back to ISO-8859-1 when the response headers declare
    # no charset, so reading r.text directly gives mojibake. Decode the
    # raw bytes as GBK instead (setting r.encoding = 'gbk' before reading
    # r.text also works).
    r_text = requests.get(url).content.decode('gbk')
    soup = BeautifulSoup(r_text, 'lxml')
    ahref_list = soup.find('dl', class_='list-left public-box').find_all('a', target='_blank')
    for ahref in ahref_list:
        starname = ahref.get_text()
        # Strip whitespace and characters that are invalid in directory names.
        starname = re.sub(r'[\s:?]', '', starname)
        href = ahref.attrs['href']
        store_star_dir = os.path.join(store_dir, starname)
        os.makedirs(store_star_dir, exist_ok=True)
        logging.info(f'Start downloading images for {starname}')
        store_page_star(href, store_star_dir)  # download everything under this link into the folder




if __name__ == '__main__':
    main_url = 'http://www.mm131.com/mingxing/'
    r_text = requests.get(main_url).content.decode('gbk')
    soup = BeautifulSoup(r_text, 'lxml')
    url_list = soup.find('dd', class_='page').find_all('a')
    for url in url_list:
        href = url.attrs['href']
        # main_url already ends with a slash, so join without adding another.
        href = f'{main_url}{href}'
        main(href)
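
A quick way to confirm the encoding problem handled in main() is to compare the encoding requests picked from the response headers with the one it detects from the body bytes. A minimal sketch (apparent_encoding comes from the charset-detection library bundled with requests):

import requests

r = requests.get('http://www.mm131.com/mingxing/')
# Encoding taken from the Content-Type header; requests falls back to
# ISO-8859-1 when the header names no charset.
print(r.encoding)
# Encoding guessed from the raw bytes; for this site it should report
# GBK/GB2312.
print(r.apparent_encoding)
# Fixing r.encoding makes r.text decode correctly, which is equivalent
# to decoding r.content by hand as the script does.
r.encoding = r.apparent_encoding
print(r.text[:100])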
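
The script also sends every request with no timeout or error handling, so a single bad page aborts the whole run. A minimal hardening sketch, assuming a hypothetical fetch_bytes helper (not part of the original code) that the download functions could call in place of requests.get:

import logging
import requests

def fetch_bytes(url, headers=None, retries=3, timeout=10):
    # Hypothetical helper: retry transient failures and log them instead
    # of letting one bad request raise out of the crawl loop.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp.content
        except requests.RequestException as exc:
            logging.warning('attempt %d/%d failed for %s: %s',
                            attempt, retries, url, exc)
    return None

store_star_img could then check for None and skip a broken image instead of crashing mid-crawl.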