#coding=utf-8
from gevent import monkey,pool
monkey.patch_all()
import os
import gevent
from lxml import etree
import urllib2
import time
# Shared module-level state used by the crawl below.
jobs = []    # greenlets spawned for the article downloads
links = []   # not referenced anywhere in the visible code
urls = []    # article URLs collected from the index pages
# Greenlet pool created with a size of 30 (bounds how many downloads run at once).
p = pool.Pool(30)
# Output file that get_links writes the extracted text to.
f = open('d:\\nlx_author.txt', 'w')
# Earlier draft of get_links, disabled by wrapping it in a bare string literal:
# the string is evaluated and discarded, so nothing inside it runs.
# The draft would write the text of each //td[@id="title"]//a element, one per line.
# NOTE(review): the draft increments `k`, which is not defined anywhere in the
# visible file, and its body has lost its indentation, so it cannot be re-enabled
# as written.
'''
def get_links(url):
r=urllib2.urlopen(url).read()
html=etree.HTML(r)
results=html.xpath('//td[@id="title"]//a')
k+=1
for txt in results:
f.write(txt.text.encode('utf-8')+'\n')
'''
def get_links(url):
    """Download one article page and append its paragraph text to the output file.

    url: URL of the article page to fetch.

    Every text node found under the <p> elements inside
    <div class="viewbox"> is UTF-8 encoded and written to the module-level
    file ``f``, followed by two newlines. Returns None. Network and parsing
    errors are not caught here and propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        r = response.read()
    finally:
        # Release the connection even when the read fails, so a long crawl
        # does not accumulate open sockets.
        response.close()
    html = etree.HTML(r)
    # Key point: //p//text() selects every text node below the <p> elements,
    # including text nested inside inline child tags.
    result = html.xpath('//div[@class="viewbox"]//p//text()')
    # NOTE(review): indentation was lost in the source; the '\n\n' separator is
    # assumed to be written once per page (after the loop), not once per text
    # node - confirm against the original script.
    # The page is emitted with a single write call.
    f.write(''.join([x.encode('utf8') for x in result]) + '\n\n')
    # Alternative kept from the original (disabled):
    # f.write(result[0].xpath('string(.)').encode('utf-8'))
# Prefix that every href taken from the index pages is appended to.
root_url = 'http://www.nlx.gov.cn/inter/'
# Test run: range(2, 3) fetches index page 2 only. The original note mentions
# testing 20 pages and says the site actually has 965 pages.
for i in range(2, 3):
    ur = 'http://www.nlx.gov.cn/inter/?tid=&pages=%d' % i
    response = urllib2.urlopen(ur)
    try:
        html = response.read()
    finally:
        # Close the index-page connection as soon as its body has been read.
        response.close()
    txt = etree.HTML(html)
    # href attribute of every link inside the cells with id="title".
    results = txt.xpath('//td[@id="title"]//a/@href')
    for r in results:
        urls.append(root_url + r)
# Print the wall-clock time before and after the crawl as a rough timing.
# A parenthesised single-argument print behaves the same under the Python 2
# print statement, so the output is unchanged.
print(time.strftime('%H:%M:%S'))
try:
    # One greenlet per article URL, all scheduled through the shared pool.
    for url in urls:
        jobs.append(p.spawn(get_links, url))
    # Wait until every download greenlet has finished.
    gevent.joinall(jobs)
    print(time.strftime('%H:%M:%S'))
finally:
    # Close the output file even if spawning or joining is interrupted.
    f.close()
# Topic of the source article: usage of XPath text().
# (Source page footer: "latest recommended article published 2023-07-09 15:36:13".)