#!/usr/bin/python
#encoding:utf-8
"""Download the .jpg files referenced by ``src`` attributes on a web page.

@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:cp6_p113.py
@time: 4/21/2019 2:30 AM

The script fetches the pythonscraping.com front page, collects every tag
that has a ``src`` attribute, normalizes each value to an absolute URL on
``baseUrl`` and stores the ``.jpg`` files below ``downloadDirectory``,
mirroring the URL path of each file.

The os module acts as an interface between Python and the operating system;
it is used here to derive the target directory of each download and to
create missing directories along the path.
"""
import os
from urllib.parse import urljoin
from urllib.request import urlretrieve
from urllib.request import urlopen

downloadDirectory = 'downloaded'
baseUrl = 'http://pythonscraping.com'


def getAbsoluteURL(baseUrl, source):
    """Return ``source`` as an absolute, ``www.``-free URL on ``baseUrl``.

    Args:
        baseUrl: Site root without ``www.``, e.g. 'http://pythonscraping.com'.
        source: Raw ``src`` value; may be absolute, scheme-less ``www.``,
            protocol-relative (``//host/...``), root-relative or relative.

    Returns:
        The normalized absolute URL, or None when the result does not
        contain ``baseUrl`` (i.e. the resource is treated as external).
    """
    if source.startswith('www.'):
        # A scheme-less host would otherwise be resolved as a relative path.
        source = 'http://{}'.format(source)

    # urljoin leaves absolute URLs untouched and resolves relative,
    # root-relative and protocol-relative references against the base.
    url = urljoin(baseUrl.rstrip('/') + '/', source)

    # Drop a leading "www." from the host so it can be compared to baseUrl.
    scheme, separator, remainder = url.partition('://')
    if separator and remainder.startswith('www.'):
        url = '{}://{}'.format(scheme, remainder[4:])

    if baseUrl not in url:
        return None
    return url


def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map ``absoluteUrl`` to a local file path and create its directory.

    Args:
        baseUrl: Site root that is removed from the URL.
        absoluteUrl: URL of the file about to be downloaded.
        downloadDirectory: Local folder that receives the downloads.

    Returns:
        The local path: ``downloadDirectory`` joined with the URL path.
    """
    # NOTE(review): the path is derived from scraped input; ".." segments
    # are not rejected here.
    relative = absoluteUrl.replace('www.', '').replace(baseUrl, '')
    path = os.path.join(downloadDirectory, relative.lstrip('/'))
    directory = os.path.dirname(path)
    if directory:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(directory, exist_ok=True)
    return path


def main():
    """Fetch the front page and download every on-site .jpg it references."""
    # Imported here so the URL helpers above stay importable without bs4.
    from bs4 import BeautifulSoup

    with urlopen('http://www.pythonscraping.com') as html:
        bs = BeautifulSoup(html, 'html.parser')

    # Select all tags on the front page that have the src attribute.
    downloadList = bs.find_all(src=True)

    # Clean and normalize the URLs to get an absolute path for each download.
    for download in downloadList:
        print("######Download: ", download)
        fileUrl = getAbsoluteURL(baseUrl, download['src'])
        if fileUrl is None:
            continue
        print("******AbsoluteURL: ", fileUrl)
        # Plain suffix test: str.endswith does not take a regex.
        if fileUrl.endswith('.jpg'):
            # Each file is downloaded to its own path in the local
            # download folder.
            urlretrieve(fileUrl,
                        getDownloadPath(baseUrl, fileUrl, downloadDirectory))


if __name__ == '__main__':
    main()
# cp6_p113: Store URLs to directories
# (Blog page footer: latest recommended article published on 2024-03-18 22:02:23.)