磕磕碰碰地在老师帮助下完成了爬虫,记录下代码。
import logging
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Show INFO-level records so the per-gallery progress message logged in main() is visible.
logging.basicConfig(level=logging.INFO)
def store_star_img(star_url,store_star_dir):
    """Download the picture shown on one gallery page into a directory.

    Fetches ``star_url``, takes the ``src`` of the first element matching
    ``.content-pic img`` and saves that image in ``store_star_dir``, named
    after the last path segment of the image URL.

    Args:
        star_url: URL of the gallery page that embeds the picture.
        store_star_dir: Existing directory the image file is written to.

    Returns:
        None. If the page has no matching picture, or the image request
        returns a non-success status, a warning is logged and nothing is
        written.

    Raises:
        requests.RequestException: If a request fails or times out.
        OSError: If the image file cannot be written.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 30

    page = requests.get(star_url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'lxml')
    pictures = soup.select('.content-pic img')
    star_src = pictures[0].get('src') if pictures else None
    if not star_src:
        logging.warning('no picture found on %s', star_url)
        return

    file_name = os.path.join(store_star_dir, star_src.split('/')[-1])
    print(file_name)

    # The gallery page is sent as Referer together with a browser User-Agent.
    # NOTE(review): presumably the image host rejects requests without them
    # (hot-link protection) -- confirm.
    headers = {
        'Referer': star_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    response = requests.get(star_src, headers=headers, timeout=timeout)
    if not response.ok:
        # Do not save an error page under an image file name.
        logging.warning('image request for %s returned HTTP %s',
                        star_src, response.status_code)
        return

    # Download first, open afterwards: a failed request leaves no empty file.
    with open(file_name, 'wb') as f:
        f.write(response.content)
def store_page_star(href,store_star_dir):
    """Save the pictures of the gallery that starts at ``href``.

    Reads the links inside ``div.content-page`` on the first gallery page
    and calls ``store_star_img`` for the first page and then for the pages
    those links point to, in document order.

    Args:
        href: Absolute URL of the first page of the gallery.
        store_star_dir: Directory the pictures are stored in.

    Returns:
        None. If the page has no ``div.content-page`` block a warning is
        logged and nothing is downloaded.

    Raises:
        requests.RequestException: If the page request fails or times out.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    page = requests.get(href, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')
    pager = soup.find('div', class_='content-page')
    if pager is None:
        logging.warning('no pagination block found on %s', href)
        return

    # Each iteration saves the page reached so far and then advances to the
    # linked page. The first page is therefore saved first, and the page
    # behind the final link is never fetched.
    # NOTE(review): presumably the final link is a "next page" shortcut that
    # repeats an earlier page -- confirm against the site markup; if it is
    # not, the last page of every gallery is being skipped.
    star_url = href
    for link in pager.find_all('a'):
        # Resolve against the page's own URL rather than a hard-coded base,
        # so relative and absolute hrefs both work.
        next_url = urljoin(href, link['href'])
        store_star_img(star_url, store_star_dir)
        star_url = next_url
def main(url):
    """Download every gallery linked from one listing page.

    For each ``<a target="_blank">`` inside ``dl.list-left public-box`` a
    sub-directory of ``mingxing`` named after the link text (with
    whitespace, ``:`` and ``?`` removed) is created, and the gallery behind
    the link is downloaded into it with ``store_page_star``.

    Args:
        url: URL of the listing page to process.

    Returns:
        None. If the page has no listing block a warning is logged and
        nothing is downloaded.

    Raises:
        requests.RequestException: If the listing request fails or times out.
    """
    store_dir = 'mingxing'
    os.makedirs(store_dir, exist_ok=True)

    # Per the original author's note, reading ``.text`` directly yields
    # garbled characters for this site, so the raw bytes are decoded as GBK.
    # ``errors='replace'`` keeps one stray byte from aborting the whole crawl.
    # The timeout stops a stalled connection from blocking forever.
    r_text = requests.get(url, timeout=30).content.decode('gbk', errors='replace')
    soup = BeautifulSoup(r_text, 'lxml')
    listing = soup.find('dl', class_='list-left public-box')
    if listing is None:
        logging.warning('no gallery list found on %s', url)
        return

    for ahref in listing.find_all('a', target='_blank'):
        # Strip whitespace, ':' and '?' so the text can serve as a folder name.
        starname = re.sub(r'[\s:?]', '', ahref.get_text())
        href = ahref.attrs['href']
        store_star_dir = os.path.join(store_dir, starname)
        os.makedirs(store_star_dir, exist_ok=True)
        logging.info(f'开始下载{starname}的图片')
        # Download the pictures behind the link into that folder.
        store_page_star(href, store_star_dir)
if __name__ == '__main__':
    # Script entry point: read the pagination links of the category landing
    # page and run main() on every listing page they point to.
    main_url = 'http://www.mm131.com/mingxing/'
    # Decode the raw bytes as GBK, as main() does; only hrefs are read here,
    # so replacing undecodable bytes is harmless. The timeout stops a stalled
    # connection from blocking forever.
    r_text = requests.get(main_url, timeout=30).content.decode('gbk', errors='replace')
    soup = BeautifulSoup(r_text, 'lxml')
    pager = soup.find('dd', class_='page')
    if pager is None:
        raise SystemExit(f'no pagination block found on {main_url}')

    # urljoin avoids the double slash that joining with '/' produced, since
    # main_url already ends in '/'. dict.fromkeys drops repeated links while
    # keeping their order, so no listing page is crawled twice.
    # NOTE(review): the landing page itself is never passed to main();
    # confirm whether the first listing page should be crawled as well.
    listing_urls = dict.fromkeys(
        urljoin(main_url, link.attrs['href']) for link in pager.find_all('a')
    )
    for listing_url in listing_urls:
        main(listing_url)