这个网站是js渲染过的,所以我们可以使用PhantomJS浏览器或者在network中找出需要post的query string中的参数,发请求就可以了,得到的是json
#!/usr/bin/python
# -*- encoding: UTF-8 -*-
from lxml import etree
import urllib
import urllib2
import jsonpath
import json
from lxml import etree
class we():
def __init__(self):
self.page=3
self.headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",}
def meiyiye(self):
item=[]
headers=self.headers
url="http://search.jiayuan.com/v2/search_v2.php"
fromdata={
"sex":"f",
"key":"",
"stc":"1%3A41%2C2%3A19.27%2C3%3A155.170%2C23%3A1",
"sn":"default",
"sv":"1",
"p":self.page,#表示爬取的第几页
"f":"",
"listStyle":"bigPhoto",
"pri_uid":"170703614",
"jsversion":"v5"}
data = urllib.urlencode(fromdata)
request=urllib2.Request(url,data=data,headers=headers)#post请求的话需要data值,而get请求不需要data有值
response = urllib2.urlopen(request)
#得到的是json格式的字符串(字典行的)
html = response.read()
html1=html.replace("##jiayser##","").replace("//","")
#把json转换成python格式的unicode字符串(列表形式的)
content = json.loads(html1)
id_list=jsonpath.jsonpath(content,"$..uid")#在content中匹配出需要的个人id,然后通过这个id拼接出个人主页的链接
for i in id_list:
item.append(i)
self.page+=1
self.meiyigeren(item)
#处理这一页每一个人的页面信息
def meiyigeren(self,item):
for id in item:
print "*******************************************"
print u"用户id:"+str(id)
url="http://www.jiayuan.com/"+str(id)+"?fxly=search_v2_index"#拼接连接,然后发送请求,找到个人主页中需要的有用的内容
print u"主页链接:"+url
headers=self.headers
request=urllib2.Request(url,headers=headers)
response = urllib2.urlopen(request)
html = response.read()
content=etree.HTML(html)#解析HTML文档为HTML DOM模型,然后下面就可以使用xpath匹配出想要的内容
username=content.xpath('//div[@class="main_1000 bg_white mt15"]//h4/text()')
if len(username)==1:
print username[0]
else:
print u"没有名字"
a=content.xpath('//div[@class="main_1000 bg_white mt15"]//h6[@class="member_name"]/text()')
we=" ".join(a)
ni=we.replace(","," ").replace(',',' ')
ha=ni.split(" ")
print u"年龄:"+ha[0]
header_url=content.xpath('//div[@class="big_pic fn-clear"]//li[2]//tr//img[@class="img_absolute"]//@_src')
if len(header_url)==1:
header_urll=header_url[0]
else:
header_urll=u"没有头像链接:"
print u"头像链接:"+header_urll
image_url=content.xpath('//div[@class="small_pic_box fn-clear"]//div[@class="small_pic fn-clear"]//li//img//@src')
print u"相册链接:",
print image_url
content1=content.xpath('//div[@class="main_1000 mt15 fn-clear"]//div[@class="bg_white"]//div[@class="js_text"]//text()')
content2=""
for i in content1:
content2+=i
content3=content2
print u"内心独白:"+content3.strip()
place=content.xpath('//div[@class="main_1000 bg_white mt15"]//h6[@class="member_name"]/a[2]/text()')
if len(place)==1:
where=place[0]
else:
where=u"河南"
print u"来自:"+where+u"省"
xueli=content.xpath('//div[@class="main_1000 bg_white mt15"]//ul[@class="member_info_list fn-clear"]/li[1]//div[@class="fl pr"]/em/text()')
if len(xueli)==1:
print u"学历:"+xueli[0]
else:
print u"学历:本科"
print "***********************************************"
if self.page<=5:
self.meiyiye()
if __name__ == "__main__":
    # Entry point: build the crawler and start from the page configured
    # in __init__ (the recursion inside the class handles the rest).
    spider = we()
    spider.meiyiye()
爬取js渲染过的页面(爬取一个婚庆网站为例)
最新推荐文章于 2023-07-22 22:51:59 发布