Python2.7利用xpath爬取韩寒博客（多线程版）

最新推荐文章于 2018-12-08 13:54:43 发布

weixin_33924220

最新推荐文章于 2018-12-08 13:54:43 发布

阅读量145

点赞数

文章标签： python

原文链接：https://juejin.im/post/5a6058d6518825733e6081ad

版权

萌新自己玩，大佬勿喷

# -*- coding: UTF-8 -*-
#参考资料：https://segmentfault.com/q/1010000004879947
import requests
from lxml import html
import os
import time
import datetime
import threading

def requestPageText(pageUrl):
    """Fetch one article-list page and save every article linked from it.

    The list page is downloaded and parsed, the article URLs and titles are
    extracted with XPath, and each article page is then downloaded in turn.
    The text of its ``sina_keyword_ad_area2`` container is handed to
    ``savePage`` on a worker thread.

    Args:
        pageUrl: URL of the article-list page to process.
    """
    n_page = requests.get(pageUrl)
    # Force UTF-8 decoding of the response; without it the titles are garbled.
    n_page.encoding = 'utf-8'
    # Re-encoding alone did not fix the titles (reason unknown to the original
    # author); the encoded bytes are what gets parsed.
    q = n_page.text.encode('utf-8')
    n_tree = html.fromstring(q)

    linkXPath = '//div[@class="articleCell SG_j_linedot1"]/p[@class="atc_main SG_dot"]/span[@class="atc_title"]/a'
    # Article URLs and their titles.
    urlArr = n_tree.xpath(linkXPath + '/@href')
    titleArr = n_tree.xpath(linkXPath + '/text()')

    # A page without any matching links has no first title to report.
    if titleArr:
        print(titleArr[0])

    for index, articleUrl in enumerate(urlArr):
        textPage = requests.get(articleUrl)
        textPage.encoding = 'utf-8'
        textTree = html.fromstring(textPage.text)
        textArray = textTree.xpath('//div[@id="sina_keyword_ad_area2"]')
        if not textArray:
            # The article page has no body container; nothing to save.
            continue
        # Pass the article's own index so savePage picks the matching title.
        t = threading.Thread(target=savePage,
                             args=(textArray[0], titleArr, index))
        t.start()
        # Joining right away means articles are written one at a time, in
        # list order, so writes to the shared output file never interleave.
        t.join()


def savePage(e, titleArr, index=0, path='/Users/jinniu/desktop/aaap/a.txt'):
    """Append one article (title followed by body text) to the output file.

    Args:
        e: Parsed element holding the article body; its full text is taken
            with the XPath expression ``string(.)``.
        titleArr: Sequence of article titles for the current list page.
        index: Position in ``titleArr`` of the title belonging to ``e``.
        path: File the article is appended to.

    Raises:
        IndexError: If ``index`` is outside ``titleArr``.
    """
    body = e.xpath('string(.)')
    title = titleArr[index]
    # Encoded bytes are written, so open in binary append mode; the context
    # manager closes the file even if a write fails.
    with open(path, 'ab') as out:
        out.write(b"\r\n")
        out.write(title.encode('utf8'))
        out.write(body.encode('utf8'))


if __name__ == '__main__':
    # Crawl list pages 1-7 of the blog and report the elapsed time.
    # time.time() gives wall-clock seconds. time.clock() reports CPU time on
    # Unix, which leaves out the time spent waiting on the network, and it no
    # longer exists in Python 3.8+.
    start = time.time()

    # Make sure the output directory exists before anything is written.
    path1 = '/Users/jinniu/desktop/aaap'
    if not os.path.exists(path1):
        os.makedirs(path1)

    for i in range(1, 8):
        pageUrl = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%s.html' % i
        print('------->' + pageUrl)
        requestPageText(pageUrl)

    end = time.time()
    print(end - start)
    
复制代码

weixin_33924220

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python2.7利用xpath爬取韩寒博客（多线程版）

萌新自己玩，大佬勿喷 # -*- coding: UTF-8 -*- # 参考资料：https://segmentfault.com/q/1010000004879947 import requests from lxml import html import os import time import datetime import threading def requestPageText(p...
复制链接

扫一扫