Python爬虫现在网上教程太多了,很多都是可以直接拿来用的。这篇博客的不同之处在于介绍一个特别简单的爬虫,很多初学者可能已经会了,涉及到文件读写、PhantomJS(模拟浏览器查看web界面)、etree/xpath解析爬下来的网页,以下是正题:
给我的任务是用爬虫抓下来所有中国省市区,并改成JS变量格式,思路是先输出成:北京**北京&&海淀区%%......,我再把它用Sublime的Ctrl+H批量替换一下。
上代码:
#coding:utf-8
from selenium import webdriver
import urllib
import re
import zlib
from lxml import etree
import codecs
import sys,os,codecs
# Py2-only bootstrap: force the default text codec to UTF-8 so the
# Chinese region names survive implicit str/unicode conversions.
# Guarded so the module also loads under Python 3, where reload() and
# sys.setdefaultencoding() no longer exist (and the hack is unneeded).
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Root of the site being scraped.  Bug fixed: the original line used
# full-width Chinese quotation marks (“...”), which is a SyntaxError.
base_url = "http://www.syxcn.com/"
msg_url = ""  # most recently generated directory URL (scratch)
url = ""      # scratch variable reused while building URLs
def DelLastChar(str):
    """Return *str* with its final character removed."""
    chars = list(str)
    # IndexError on empty input, matching the original behaviour.
    del chars[-1]
    return "".join(chars)
def yop():
    """Print candidate city directory URLs under ``base_url``.

    For each province code in 11..12 and each city number 1..19 this
    prints the city directory URL (e.g. ``.../11/01/``) and records it
    in the local ``msg_url``.

    Bug fixed: the original ``while num < 20`` loop never incremented
    ``num``, so it spun forever on ``num == 1``.
    """
    for data_num in range(11, 13):
        data_url = base_url + str(data_num) + "/"
        num = 1
        while num < 20:
            # Zero-pad single-digit city numbers ("01".."09") — this
            # replaces the original duplicated if/else branches.
            num_new = "%02d" % num
            url = data_url + num_new + "/"
            # Page listing this city's districts, e.g. .../11/1101.html
            url_title = base_url + str(data_num) + "/" + str(data_num) + num_new + ".html"
            print("\t\n")
            print(url)
            msg_url = url
            num = num + 1  # fix: advance the loop counter
(这段代码的效果是生成很多:“http://www.syxcn.com/11/01/1101.html”)
一卡就是两天,发现了一个很严重的问题:这个是我随便找的网站:http://www.syxcn.com/,实在是没法用啊,延迟卡得不行,各种连不上,这怎么行?!
直到我又找了一个静态化处理了的网页,链接:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/,好用的多。
使用PhantomJS,etree的xpath处理的代码:
def show_new_data(queue_web, msg_url, currect_url):
    """Scrape district names from *currect_url*, append them to
    topic01.txt, then descend one level via get_grade().

    queue_web   : URL handed through to get_grade().
    msg_url     : directory URL prefix handed through to get_grade().
    currect_url : page actually fetched and parsed here.
    """
    driver_web = webdriver.PhantomJS()
    driver_web.get(currect_url)
    tree = etree.HTML(driver_web.page_source)
    # First <a> text of every row in the deeply nested layout tables.
    node_new_web = tree.xpath(u"/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/a[1]/text()")
    # Open the output once instead of reopening it for every row.
    with codecs.open('topic01.txt', 'a', 'utf-8') as fd:
        # Skip the first hit — presumably a navigation link, not a
        # district name (TODO confirm against the live page).
        for web_msg in node_new_web[1:]:
            print(web_msg)
            fd.writelines('Submit\r\n')
            fd.writelines(web_msg)
            fd.writelines('document\r\n')
    print("This is a line ")
    res = get_grade(queue_web, msg_url)
    print(res)  # fix: original printed the undefined name ``resgi``
然后用 for...range(0,len(...)) 按下标遍历得到的 url 列表,逐个爬取数据并格式化输出。
搞定。
以下是全部代码:
#我把get_data的数值改成11,13,其实这个应该是11,65的
#coding:utf-8
from selenium import webdriver
import urllib
import re
import zlib
from lxml import etree
import codecs
import sys,os,codecs
# Py2-only bootstrap: force the default text codec to UTF-8 so the
# Chinese region names survive implicit str/unicode conversions.
# Guarded so the module also loads under Python 3, where reload() and
# sys.setdefaultencoding() no longer exist (and the hack is unneeded).
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Root of the statically generated 2016 administrative-division code
# tables on the National Bureau of Statistics site.
base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/"
msg_url = ""  # most recently generated directory URL (scratch)
url = ""      # scratch variable reused while building URLs
def DelLastChar(str):
    """Drop the final character of *str* and return what remains."""
    pieces = list(str)
    pieces.pop()  # raises IndexError on empty input, like the original
    return "".join(pieces)
def get_data():
    """Walk the province (11..12) / city (1..19) / county (1..98) URL
    space and scrape every county page via show_new_data().

    NOTE(review): range(11, 13) covers only Beijing/Tianjin; per the
    author's own comment it should be range(11, 65) for all of China.

    Bug fixed: in the original, the ``set_msg_num >= 10`` branch built
    ``new_msg_url`` from the stale ``set_num`` left over from the last
    zero-padded iteration instead of the current ``set_msg_num``.
    """
    for data_num in range(11, 13):  # province codes
        data_url = base_url + str(data_num) + "/"
        num = 1
        while num < 20:  # city number within the province
            # Zero-pad ("01".."09") — replaces the duplicated if/else.
            num_new = "%02d" % num
            url = data_url + num_new + "/"
            # Page listing this city's counties, e.g. .../11/1101.html
            url_title = base_url + str(data_num) + "/" + str(data_num) + num_new + ".html"
            print("\t\n")
            print(url)
            msg_url = url
            set_msg_num = 1
            while set_msg_num < 99:  # county number within the city
                set_num = "%02d" % set_msg_num
                new_url = msg_url + str(data_num) + num_new + set_num + ".html"
                # fix: use the current county number, not stale state
                new_msg_url = msg_url + str(data_num) + num_new + set_num
                print("\t\n")
                print(new_url)
                print(msg_url)
                res = show_new_data(new_url, msg_url, url_title)
                print(res)
                set_msg_num = set_msg_num + 1
            num = num + 1
    print('finally')
def get_grade(data_url, data_url_set):
    """Fetch *data_url*, extract region names and sub-page links,
    append the names to topic01.txt and recurse into every link via
    show_data().

    data_url     : page to fetch and parse.
    data_url_set : URL prefix joined with each relative @href.
    Returns 'OK' once the page has been processed.
    """
    # NOTE(review): a SOCKS proxy was configured but never passed to
    # PhantomJS in the original; kept here for reference only.
    # service_args = ['--proxy=localhost:8889', '--proxy-type=socks5']
    driver = webdriver.PhantomJS()
    driver.get(data_url)
    data = driver.page_source
    print(data)
    tree = etree.HTML(data)
    print(tree)
    # Name text and href of the first <a> of every row in the nested
    # layout tables.
    node_data = tree.xpath(u"/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/a[1]/text()")
    node_data_url = tree.xpath(u"/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[1]/a[1]/@href")
    print("..........................................................")
    # Absolute URLs of the sub-pages.
    msg_node = []
    for queue_msg in node_data_url:
        new_queue_msg = data_url_set + queue_msg
        print(new_queue_msg)
        print(len(node_data_url))
        msg_node.append(new_queue_msg)
    # Keep only the name cells: the 12-character entries look like the
    # numeric region codes, not names.  (Original compared
    # str(len(x)) == '12'; plain integer comparison is equivalent.)
    # The original also carried a ``data1``/``test_doc`` loop that
    # could never run because data1 was never populated — removed.
    data2 = [web_msg for web_msg in node_data if len(web_msg) != 12]
    for num_web, web_msg in enumerate(data2):
        # Open per record (original opened per record too) but with a
        # context manager so the handle is always closed.
        with codecs.open('topic01.txt', 'a', 'utf-8') as fd:
            fd.writelines('*\r\n')
            fd.writelines('*\r\n')
            fd.writelines(web_msg)
            fd.writelines('@\r\n')
        print(msg_node[num_web])
        print("llllllllllllllllllllllllllllll")
        show_data(str(msg_node[num_web]))
    return 'OK'
def show_new_data(queue_web, msg_url, currect_url):
    """Scrape district names from *currect_url*, append them to
    topic01.txt, then descend one level via get_grade().

    queue_web   : URL handed through to get_grade().
    msg_url     : directory URL prefix handed through to get_grade().
    currect_url : page actually fetched and parsed here.
    """
    driver_web = webdriver.PhantomJS()
    driver_web.get(currect_url)
    tree = etree.HTML(driver_web.page_source)
    # First <a> text of every row in the deeply nested layout tables.
    node_new_web = tree.xpath(u"/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/a[1]/text()")
    # Open the output once instead of reopening it for every row.
    with codecs.open('topic01.txt', 'a', 'utf-8') as fd:
        # Skip the first hit — presumably a navigation link, not a
        # district name (TODO confirm against the live page).
        for web_msg in node_new_web[1:]:
            print(web_msg)
            fd.writelines('Submit\r\n')
            fd.writelines(web_msg)
            fd.writelines('document\r\n')
    print("This is a line ")
    res = get_grade(queue_web, msg_url)
    print(res)
def show_data(queue_web):
    """Fetch *queue_web* (a leaf-level region page) and append every
    table-cell text to topic01.txt, skipping entries of length 12 and
    3 (presumably the numeric region codes and category markers —
    TODO confirm against the live page).
    """
    driver_web = webdriver.PhantomJS()
    driver_web.get(queue_web)
    tree = etree.HTML(driver_web.page_source)
    # Raw text of every cell in the deeply nested layout tables.
    node_new_web = tree.xpath(u"/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/text()")
    # Open the output once instead of reopening it per cell; the
    # original compared str(len(x)) == '12'/'3' via nested if/pass —
    # a plain integer membership test is equivalent and flatter.
    with codecs.open('topic01.txt', 'a', 'utf-8') as fd:
        for web_msg in node_new_web:
            if len(web_msg) in (12, 3):
                continue
            print(web_msg)
            fd.writelines(web_msg)
# --- script entry point -------------------------------------------------
# Sample leaf page (county 469029) used while testing get_grade().
data_url="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46/90/469029.html"
# Prefix joined with the relative hrefs found on that page.
data_url_set="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46/90/"
#new_res=get_grade(data_url,data_url_set)
#print new_res
# Sample village-level page used while testing show_data().
data_url_set_end="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46/90/29/469029104.html"
#res=show_data(data_url_set_end)
#print res
# Full crawl: walk every province/city/county page and write topic01.txt.
# NOTE: get_data() has no return statement, so res is None here.
res=get_data()
#res=sub_num()
print res