# A beginner's practice script, written for fun -- experts, please go easy on it.
# -*- coding: UTF-8 -*-
# Reference: https://segmentfault.com/q/1010000004879947
import requests
from lxml import html
import os
import time
import datetime
import threading
def requestPageText(pageUrl):
    """Download one article-list page and save every article it links to.

    The list page is fetched and parsed, the article URLs and titles are
    extracted with XPath, and each linked article is then fetched in turn
    and passed to ``savePage`` together with the index of its title.

    Args:
        pageUrl: URL of the article-list page to process.
    """
    n_page = requests.get(pageUrl)
    # Force the response to be decoded as UTF-8; without this the titles
    # come out garbled.
    n_page.encoding = 'utf-8'
    # The parser is handed UTF-8 bytes rather than the decoded text.
    # NOTE: per the original author, encoding here alone (without setting
    # ``encoding`` above) did not fix the titles; the reason was not known.
    n_tree = html.fromstring(n_page.text.encode('utf-8'))

    # Both queries share the same path to the title anchor.
    anchor = ('//div[@class="articleCell SG_j_linedot1"]'
              '/p[@class="atc_main SG_dot"]'
              '/span[@class="atc_title"]/a')
    urlArr = n_tree.xpath(anchor + '/@href')      # article URLs
    titleArr = n_tree.xpath(anchor + '/text()')   # article titles

    # An empty result used to raise IndexError here.
    if titleArr:
        print(titleArr[0])

    # Only visit URLs that have a title at the same position.
    for index, url in enumerate(urlArr[:len(titleArr)]):
        textPage = requests.get(url)
        textPage.encoding = 'utf-8'
        textTree = html.fromstring(textPage.text)
        textArray = textTree.xpath('//div[@id="sina_keyword_ad_area2"]')
        if not textArray:
            # No body container found on this page: skip it instead of
            # failing on textArray[0].
            continue
        # The article's own index is passed along so the matching title is
        # written next to its body.
        t = threading.Thread(target=savePage,
                             args=(textArray[0], titleArr, index))
        t.start()
        # Joining immediately means only one save runs at a time.
        t.join()


def savePage(e, titleArr, index=0, path='/Users/jinniu/desktop/aaap/a.txt'):
    """Append one article (title followed by body text) to the output file.

    A ``\\r\\n`` separator is written first, then the title, then the body,
    all as UTF-8 bytes.

    Args:
        e: Parsed element holding the article body; its text is taken with
            the XPath expression ``string(.)``.
        titleArr: Sequence of article titles.
        index: Position in ``titleArr`` of the title belonging to ``e``.
            Defaults to 0.
        path: File to append to. Defaults to the original hard-coded
            location.
    """
    t = e.xpath('string(.)')
    # Everything written is already encoded, so the file is opened in binary
    # append mode; ``with`` closes it even if a write fails.
    with open(path, 'ab') as write:
        write.write(b"\r\n")
        write.write(titleArr[index].encode('utf8'))
        write.write(t.encode('utf8'))
if __name__ == '__main__':
    # Script entry point: make sure the output directory exists, process
    # list pages 1 through 7, then print the elapsed time in seconds.

    # Wall-clock time is used because the run is dominated by network
    # waits; time.clock() reports processor time on Unix and no longer
    # exists in Python 3.8+.
    start = time.time()
    path1 = '/Users/jinniu/desktop/aaap'
    if not os.path.exists(path1):
        os.makedirs(path1)
    for i in range(1, 8):
        # Build the page URL once and use it for both the log line and the
        # request.
        pageUrl = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%s.html' % i
        print('------->' + pageUrl)
        requestPageText(pageUrl)
    end = time.time()
    print(end - start)
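# ("复制代码" / "copy code": this appears to be a stray button label from the web page the script was copied from; it is not part of the program.)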