Crawling WeChat Articles
import urllib.request
import re
import time

# Simulate a browser and go through a proxy when fetching data
def use_proxy(proxy_addr, url):
    # Pretend to be a normal browser
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
    # Set the HTTP proxy (ProxyHandler, not HTTPHandler, takes the proxy dict)
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [headers]
    # Install the opener globally so plain urlopen() calls use it
    urllib.request.install_opener(opener)
    # Send the request and decode the response
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data
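Note that install_opener() replaces the global opener, so every urlopen() call in the process afterwards goes through this proxy. If you would rather keep the proxy local to a single request, a minimal sketch (use_proxy_local is a name I made up; the stdlib calls are otherwise the same) is to call the opener directly:

# A minimal sketch: use the opener directly instead of installing it globally.
def use_proxy_local(proxy_addr, url):
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    # opener.open() routes only this call through the proxy; the global opener is untouched
    return opener.open(url).read().decode('utf-8')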
# Collect the article links on each result page
def getListUrl(keywords, pagestart, pageend, proxy):
    # Holds the article links gathered per page
    listUrls = []
    # URL-encode the search keyword
    keywords_encode = urllib.request.quote(keywords)
    # Fetch every result page and save its article links
    try:
        for page in range(pagestart, pageend):
            # "&page=" must stay unencoded, or the query-string separators break
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keywords_encode + "&page=" + str(page)
            print(url)
            data = use_proxy(proxy, url)
            print(data)
            # Filter out the article links on this page
            pattern = '<a data-z="art" target="_blank" id=".*?" href="(http://.*?)"'
            results = re.compile(pattern, re.S).findall(data)
            # Save this page's links
            listUrls.append(results)
        return listUrls
    except urllib.request.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On a URLError, back off for 10 seconds
        time.sleep(10)
    except Exception as e:
        print("An exception occurred:", e)
        # On any other exception, wait 1 second
        time.sleep(1)
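For reference, the page URL above can also be assembled with urllib.parse.urlencode, which percent-encodes the values while leaving the '&' and '=' separators intact; a small sketch (build_search_url is a name of my own, not from the original):

from urllib.parse import urlencode

def build_search_url(keywords, page):
    # urlencode quotes each value but keeps the separators between parameters
    query = urlencode({'type': 2, 'query': keywords, 'page': page})
    return "http://weixin.sogou.com/weixin?" + query

# build_search_url("物联网", 1)
# -> http://weixin.sogou.com/weixin?type=2&query=%E7%89%A9%E8%81%94%E7%BD%91&page=1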
# Fetch each article's title and content
def getContent(listUrls, proxy):
    # HTML header of the output file
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat Articles</title>
</head>
<body>'''
    dataStart = open("D:/python/file/9.html", "wb")
    dataStart.write(html1.encode('utf-8'))
    dataStart.close()
    # listUrls is a two-dimensional list: listUrls[i] holds page i's article links
    for i in range(len(listUrls)):
        for j in range(len(listUrls[i])):
            try:
                url = listUrls[i][j]
                # The scraped links contain extra "amp;" fragments that the browser's
                # address bar does not show, so strip them out before requesting the page
                deal_url = url.replace("amp;", "")
                # Fetch the article page through the proxy
                articleData = use_proxy(proxy, deal_url)
                # Regex for the article title
                title_pattern = '<title>(.*?)</title>'
                title = re.compile(title_pattern, re.S).findall(articleData)
                # Regex for the article content
                content_pattern = 'id="js_content">(.*?)id="js_sg_bar"'
                content = re.compile(content_pattern, re.S).findall(articleData)
                articleTitle = "No title found"
                articleContent = "No content found"
                if title != []:
                    articleTitle = title[0]
                if content != []:
                    articleContent = content[0]
                print("Title: " + articleTitle)
                print("Content: " + articleContent)
                detail = "<p>Title: " + articleTitle + "</p><p>Content: " + articleContent + "</p><br>"
                detailContent = open("D:/python/file/9.html", "ab")
                detailContent.write(detail.encode('utf-8'))
                detailContent.close()
                print("Processed link " + str(j + 1) + " of page " + str(i + 1))
            except urllib.request.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                # On a URLError, back off for 10 seconds
                time.sleep(10)
            except Exception as e:
                print("An exception occurred:", e)
                # On any other exception, wait 1 second
                time.sleep(1)
    # HTML footer of the output file
    html2 = '''</body>
</html>'''
    dataEnd = open("D:/python/file/9.html", "ab")
    dataEnd.write(html2.encode('utf-8'))
    dataEnd.close()
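The title and content lookups above repeat the same "first match or fallback" pattern; as a sketch, it can be factored into a small helper (extract_first is a hypothetical name, not part of the original code):

def extract_first(pattern, text, default):
    # Return the first regex capture in text, or the default when nothing matches
    matches = re.compile(pattern, re.S).findall(text)
    return matches[0] if matches else default

# Usage inside the loop above:
# articleTitle = extract_first('<title>(.*?)</title>', articleData, "No title found")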
# Proxy settings (these free proxies are long dead; substitute your own)
proxy1 = "344.48.5.154:8989"
proxy2 = "303.213.247.12:8989"
# range(pageStart, pageEnd) fetches page 1 only; raise pageEnd to crawl more pages
pageStart = 1
pageEnd = 2
# The search keyword ("Internet of Things")
keywords = "物联网"
listUrls = getListUrl(keywords, pageStart, pageEnd, proxy2)
getContent(listUrls, proxy2)
If you want to run this yourself, swap in your own proxy IPs; the free ones I used here have already expired.
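Since free proxies expire quickly, it can save time to test a proxy before starting a crawl. A minimal sketch using only the standard library (check_proxy and the test URL are my own choices, not from the original):

def check_proxy(proxy_addr, test_url="http://weixin.sogou.com", timeout=5):
    # True if the proxy can fetch the test URL within the timeout, False otherwise
    try:
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy)
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False

# Only start crawling when check_proxy(proxy2) returns True.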