Learning Python: writing a web crawler

It looks simple enough, but in practice all kinds of problems came up, including Python 2 vs. Python 3 syntax differences. I can now more or less download the links on a web page, but bugs keep appearing, e.g. encoding problems in URLs, HTTP version issues, and so on. I'll keep exploring...
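One of the URL-encoding problems mentioned above is that urlretrieve fails when a URL contains non-ASCII or otherwise unsafe characters. A minimal sketch of a workaround, assuming only the path and query need percent-encoding (the safe_url helper name is my own, not part of the script below):

import urllib.parse

def safe_url(url):
    # Percent-encode path and query so downstream code only ever
    # sees ASCII-safe URLs; scheme and host are left untouched.
    parts = urllib.parse.urlsplit(url)
    return urllib.parse.urlunsplit((
        parts.scheme,
        parts.netloc,
        urllib.parse.quote(parts.path),
        urllib.parse.quote(parts.query, safe='=&'),
        parts.fragment,
    ))

With that noted, here is the full script: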

#!/usr/bin/env python3


import os
import ssl
import sys
from html.parser import HTMLParser
import urllib.error
import urllib.parse
import urllib.request
import http.client

# Workarounds for the problems mentioned above: force HTTP/1.0 so the
# server doesn't use chunked transfer encoding (a common trigger of
# IncompleteRead), and skip HTTPS certificate verification.
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
ssl._create_default_https_context = ssl._create_unverified_context


class pageDeal(HTMLParser):
    'Download one page and collect the https links it contains.'

    def __init__(self, url):
        super().__init__()
        self.url, self.file = self.getUrl(url)
        self.data = set()

    def getUrl(self, url):
        'Produce a usable local file path for the URL.'
        parsed = urllib.parse.urlparse(url)
        # Strip any user:pass@ prefix and :port suffix, keeping the host.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filePath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            # No file extension: treat the URL as a directory and save
            # the page as index.html inside it.
            filePath = os.path.join(filePath, 'index.html')
        linkDir = os.path.dirname(filePath)
        if not os.path.isdir(linkDir):
            if os.path.exists(linkDir):
                # A plain file is squatting on the directory name.
                os.unlink(linkDir)
            os.makedirs(linkDir)
        return url, filePath

    def handle_starttag(self, tag, attrs):
        # Collect only absolute https links from <a href="..."> tags.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value.startswith('https'):
                    self.data.add(value)

    def download(self):
        'Download self.url and save it as self.file.'
        print(self.file, '  ', self.url)
        try:
            ret = urllib.request.urlretrieve(self.url, self.file)
        except (OSError, urllib.error.URLError) as e:
            ret = (('*** ERROR: bad URL %s: %s' % (self.url, e)),)
        return ret

    def parserLink(self):
        'Parse the downloaded file and return the set of links found.'
        # The saved page is not guaranteed to decode cleanly as text,
        # so replace undecodable bytes instead of crashing.
        with open(self.file, 'r', errors='replace') as f:
            self.feed(f.read())
        return self.data

class sumInfo(object):
    'Bookkeeping for a crawl: URL queue, visited set, and base domain.'
    count = 0

    def __init__(self, url):
        self.q = [url]
        self.seen = set()
        parsed = urllib.parse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        # Keep the last two labels of the host name, e.g. 'baidu.com'.
        self.dom = '.'.join(host.split('.')[-2:])

def main():
    # if len(sys.argv) > 1:
    #     url = sys.argv[1]
    # else:
    #     try:
    #         url = input('input URL : ')
    #     except (KeyboardInterrupt, EOFError):
    #         url = ''
    # if not url:
    #     print('no usable URL given, exiting')
    #     return
    # if not url.startswith(('http://', 'https://', 'ftp://')):
    #     url = 'http://%s' % url
    #url = 'http://tieba.baidu.com/p/2256306796'
    url = 'https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object'
    mainPage = pageDeal(url)
    print(mainPage.download())
    inPage = mainPage.parserLink()
    print('URL:  %s\nFILE:  %s' % (mainPage.url, mainPage.file))
    print(inPage)
    for iUrl in inPage:
        print(iUrl)
        iPage = pageDeal(iUrl)
        iPage.download()
        #print('URL:  %s\nFILE:  %s' % (iPage.url, iPage.file))


if __name__ == '__main__':
    main()
    #ret = urllib.request.urlretrieve('https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object', 'w00444862.flag')
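One more failure mode worth noting: if every download comes back as an error page or HTTP 403, the site may be rejecting the default Python User-Agent. A minimal sketch of a workaround (the Mozilla/5.0 string is just a placeholder; any browser-like value should do); after install_opener(), the urlretrieve calls above send the header automatically:

import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]  # placeholder UA string
urllib.request.install_opener(opener)  # urlretrieve now uses this opener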