# Blog note (translated): it looked simple, but in practice there were all
# kinds of problems -- Python 2 vs 3 syntax differences, URL encoding issues,
# HTTP version issues, and so on. It can now more or less download the links
# found on a page, though bugs remain; to be explored further.
#!/usr/bin/env python3
"""Toy crawler: mirror one page to disk, collect its https links, fetch them."""
import os
import ssl
from html.parser import HTMLParser
import urllib.parse
import urllib.request
import urllib.error
import http.client

# Force HTTP/1.0 -- works around chunked-transfer problems seen with some
# servers (the "http版本问题" mentioned in the note above).
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# Disable TLS certificate verification globally.
# NOTE(review): insecure; acceptable only for this learning exercise.
ssl._create_default_https_context = ssl._create_unverified_context
import sys


class pageDeal(HTMLParser):
    """Download one URL to a local mirror path and collect its https links."""

    def __init__(self, url):
        HTMLParser.__init__(self)
        self.url, self.file = self.getUrl(url)
        self.data = set()  # https hrefs found while parsing the page

    def getUrl(self, url):
        """Map *url* to a local file path (host/path[/index.html]) and
        create the containing directory.

        Returns (url, local_file_path).
        """
        parsed = urllib.parse.urlparse(url)
        # Strip userinfo ("user@") and port (":8080") from the netloc.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filePath = '%s%s' % (host, parsed.path)
        # Paths without a file extension are treated as directories.
        if not os.path.splitext(parsed.path)[1]:
            filePath = os.path.join(filePath, 'index.html')
        linkDir = os.path.dirname(filePath)
        if not os.path.isdir(linkDir):
            if os.path.exists(linkDir):
                # A previously downloaded plain file blocks the directory;
                # remove it so makedirs can succeed.
                os.unlink(linkDir)
            os.makedirs(linkDir)
        return url, filePath

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback: remember every https href on an <a> tag."""
        if tag == 'a':
            for name, value in attrs:
                # BUG FIX: valueless attributes (e.g. "<a href>") yield
                # value None; "'https' in None" raised TypeError before.
                if name == 'href' and value and 'https' in value:
                    self.data.add(value)

    def download(self):
        """Fetch self.url into self.file.

        Returns urlretrieve's (filename, headers) tuple on success, or a
        one-element tuple containing an error message on failure.
        """
        print(self.file, ' ', self.url)
        try:
            # BUG FIX: URLError lives in urllib.error, not urllib.request.
            ret = urllib.request.urlretrieve(self.url, self.file)
        except (IOError, urllib.error.URLError) as e:
            ret = (('*** ERROR: bad URL %s: %s' % (self.url, e)),)
        return ret

    def parserLink(self):
        """Parse the downloaded file and return the set of https links."""
        # BUG FIX: the original caught http.client.IncompleteRead around a
        # *local* file read; the failures that actually happen here are
        # decode/IO errors. Undecodable bytes are replaced so one bad page
        # no longer aborts the crawl.
        try:
            with open(self.file, 'r', errors='replace') as f:
                page = f.read()
        except (OSError, UnicodeDecodeError):
            return self.data
        self.feed(page)
        return self.data


class sumInfo(object):
    """Crawl bookkeeping: URL queue, visited set, and the registrable domain."""
    count = 0  # class-wide page counter (unused so far)

    def __init__(self, url):
        self.q = [url]       # URLs still to visit
        self.seen = set()    # URLs already visited
        # BUG FIX: urlparse belongs to urllib.parse; urllib.request only
        # re-exports it as an implementation detail.
        parsed = urllib.parse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        # Keep the last two labels, e.g. "www.example.com" -> "example.com".
        self.dom = '.'.join(host.split('.')[-2:])


def main():
    """Download a hard-coded page, then fetch every https link found in it."""
    url = 'https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object'
    mainPage = pageDeal(url)
    print(mainPage.download())
    inPage = mainPage.parserLink()
    print('URL: %s\nFILE: %s' % (mainPage.url, mainPage.file))
    print(inPage)
    for iUrl in inPage:
        print(iUrl)
        iPage = pageDeal(iUrl)
        iPage.download()


if __name__ == '__main__':
    main()
python学习爬虫
最新推荐文章于 2020-12-04 08:16:23 发布