Python: Crawling WeChat Articles

This script crawls WeChat articles returned by Sogou's WeChat search (weixin.sogou.com): it fetches the result pages through an HTTP proxy, extracts the article links with regular expressions, then downloads each article and writes its title and content into a local HTML file.

import re
import time
import urllib.request
import urllib.error

# Fetch a page through an HTTP proxy while pretending to be a real browser
def use_proxy(proxy_addr, url):
    # Browser-like User-Agent header
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
    # Set up the HTTP proxy
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    # Send the request and read the response
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data
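A minimal smoke test of use_proxy might look like the sketch below; the proxy address is a placeholder and the call will only succeed with a live proxy:

# Placeholder proxy -- replace with a working HTTP proxy before running
test_proxy = "127.0.0.1:8989"
try:
    page = use_proxy(test_proxy, "http://weixin.sogou.com/")
    print("Fetched", len(page), "characters through the proxy")
except Exception as err:
    print("Proxy request failed:", err)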
# Collect the article links from every page of search results
def getListUrl(keywords, pagestart, pageend, proxy):
    # Holds the article links collected from each page
    listUrls = []
    # URL-encode the search keyword
    keywords_encode = urllib.request.quote(keywords)
    # Fetch each results page and save its article links
    try:
        for page in range(pagestart, pageend):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keywords_encode + "&page=" + str(page)
            print(url)
            data = use_proxy(proxy, url)
            print(data)
            # Extract the article links from this page
            pattrn = '<a data-z="art" target="_blank" id=".*?" href="(http://.*?)"'
            results = re.compile(pattrn, re.S).findall(data)
            # Save the links found on this page
            listUrls.append(results)
        return listUrls
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Wait 10 seconds on a URLError
        time.sleep(10)
    except Exception as e:
        print("Unexpected exception:", e)
        # Wait 1 second on any other exception
        time.sleep(1)
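getListUrl returns a nested list: one inner list of article URLs per results page. A quick sketch of inspecting that structure (the proxy address is again a placeholder):

pages = getListUrl("物联网", 1, 2, "127.0.0.1:8989")  # placeholder proxy
if pages:
    for pageIndex, links in enumerate(pages):
        print("Page", pageIndex + 1, "yielded", len(links), "article links")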
# Fetch each article and extract its title and content
def getContent(listUrls, proxy):
    # HTML head for the output file
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>WeChat Articles</title>
    </head>
    <body>'''
    dataStart = open("D:/python/file/9.html", "wb")
    dataStart.write(html1.encode('utf-8'))
    dataStart.close()
    # listUrls is a two-dimensional list: one inner list of article links per results page
    for i in range(len(listUrls)):
        for j in range(len(listUrls[i])):
            try:
                url = listUrls[i][j]
                # The scraped links contain extra "amp;" (HTML-escaped &) that the browser address bar does not, so strip it out
                deal_url = url.replace("amp;", "")
                # Fetch the article page through the proxy
                articleData = use_proxy(proxy, deal_url)
                # Regex for the article title
                title_pattrn = '<title>(.*?)</title>'
                title = re.compile(title_pattrn, re.S).findall(articleData)
                # Regex for the article content
                content_pattrn = 'id="js_content">(.*?)id="js_sg_bar"'
                content = re.compile(content_pattrn, re.S).findall(articleData)
                articleTitle = "No title found"
                articleContent = "No content found"
                if title != []:
                    articleTitle = title[0]
                if content != []:
                    articleContent = content[0]
                print("Title: " + articleTitle)
                detail = "<p>Title: " + articleTitle + "</p><p>Content: " + articleContent + "</p><br>"
                detailContent = open("D:/python/file/9.html", "ab")
                detailContent.write(detail.encode('utf-8'))
                detailContent.close()
                print("Page " + str(i + 1) + ": processed article " + str(j + 1))
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                # Wait 10 seconds on a URLError
                time.sleep(10)
            except Exception as e:
                print("Unexpected exception:", e)
                # Wait 1 second on any other exception
                time.sleep(1)
    # HTML tail for the output file
    html2 = '''</body>
              </html>'''
    dataEnd = open("D:/python/file/9.html", "ab")
    dataEnd.write(html2.encode('utf-8'))
    dataEnd.close()
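The title and content extraction above relies on re.S, which lets .*? match across newlines. A tiny self-contained illustration with made-up HTML:

import re

sample = '<title>Demo\narticle</title> ... id="js_content">body\ntext</div> id="js_sg_bar"'
titles = re.compile('<title>(.*?)</title>', re.S).findall(sample)
bodies = re.compile('id="js_content">(.*?)id="js_sg_bar"', re.S).findall(sample)
print(titles)   # ['Demo\narticle']
print(bodies)   # ['body\ntext</div> ']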
# Proxy settings (replace with your own working proxies)
proxy1 = "344.48.5.154:8989"
proxy2 = "303.213.247.12:8989"
pageStart = 1
pageEnd = 2
# Search keyword ("Internet of Things")
keywords = "物联网"
listUrls = getListUrl(keywords, pageStart, pageEnd, proxy2)
if listUrls:
    getContent(listUrls, proxy2)

If you want to run this, please substitute your own proxy IPs; the free ones I used have already expired.
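A simple way to check that a replacement proxy is alive before starting the crawl; the helper below and the proxy address are only a sketch, not part of the original script:

import urllib.request

def proxy_is_alive(proxy_addr, test_url="http://weixin.sogou.com/", timeout=10):
    # Build a throwaway opener so the global opener installed by use_proxy is untouched
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy_addr}))
    try:
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False

print(proxy_is_alive("127.0.0.1:8989"))  # placeholder proxy address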

Reposted from: https://my.oschina.net/quguangle/blog/1840774
