这个网站是js渲染过的,所以我们可以使用PhantomJS浏览器或者在network中找出需要post的query string中的参数,发请求就可以了,得到的是json
#!/usr/bin/python
# -*- encoding: UTF-8 -*-
from lxml import etree
import urllib
import urllib2
import jsonpath
import json
from lxml import etree
class we():
def __init__(self):
self.page=3
self.headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",}
def meiyiye(self):
item=[]
headers=self.headers
url="http://search.jiayuan.com/v2/search_v2.php"
fromdata={
"sex":"f",
"key":"",
"stc":"1%3A41%2C2%3A19.27%2C3%3A155.170%2C23%3A1",
"sn":"default",
"sv":"1",
"p":self.page,#表示爬取的第几页
"f":"",
"listStyle":"bigPhoto",
"pri_uid":"170703614",
"jsversion":"v5"}
data = urllib.urlencode(fromdata)
request=urllib2.Request(url,data=data,headers=headers)#post请求的话需要data值,而get请求不需要data有值
response = urllib2.urlopen(request)
#得到的是json格式的字符串(字典行的)
html = response.read()
html1=html.replace("##jiayser##","").replace("//","")
#把json转换成python格式的unicode字符串(列表形式的)
content = json.loads(html1)
id_list=jsonpath.jsonpath(content,"$..uid")#在content中匹配出需要的个人id,然后通过这个id拼接出个人主页的链接
for i in id_list:
item.append(i)
self.page+=1
self.meiyigeren(item)
#处理这一页每一个人的页面信息
def meiyigeren(self,item):
for id in item:
print "*******************************************"
print u"用户id:"+str(id)
url="http://www.jiayuan.com/"+str(id)+"?fxly=search_v2_index"#拼接连接,然后发送请求,找到个人主页中需要的有用的内容
print u"主页链接:"+url
headers=self.headers
request=urllib2.Request(url,headers=headers)
response = urllib2.urlopen(request)
html = response.read()
content=etree.HTML(html)#解析HTML文档为HTML DOM模型,然后下面就可以使用xpath匹配出想要的内容
username=content.xpath('//div[@class="main_1000 bg_white mt15"]//h4/text()')
if len(username)==1:
print username[0]
else:
print u"没有名字"
a=content.xpath('//div[@class="main_1000 bg_white mt15"]//h6[@class="member_name"]/text()')
we=" ".join(a)
ni=we.replace(","," ").replace(',',' ')
ha=ni.split(" ")
print u"年龄:"+ha[0]
header_url=content.xpath('//div[@class="big_pic fn-clear"]//li[2]//tr//img[@class="img_absolute"]//@_src')
if len(header_url)==1:
header_urll=header_url[0]
else:
header_urll=u"没有头像链接:"
print u"头像链接:"+header_urll
image_url=content.xpath('//div[@class="small_pic_box fn-clear"]//div[@class="small_pic fn-clear"]//li//img//@src')
print u"相册链接:",
print image_url
content1=content.xpath('//div[@class="main_1000 mt15 fn-clear"]//div[@class="bg_white"]//div[@class="js_text"]//text()')
content2=""
for i in content1:
content2+=i
content3=content2
print u"内心独白:"+content3.strip()
place=content.xpath('//div[@class="main_1000 bg_white mt15"]//h6[@class="member_name"]/a[2]/text()')
if len(place)==1:
where=place[0]
else:
where=u"河南"
print u"来自:"+where+u"省"
xueli=content.xpath('//div[@class="main_1000 bg_white mt15"]//ul[@class="member_info_list fn-clear"]/li[1]//div[@class="fl pr"]/em/text()')
if len(xueli)==1:
print u"学历:"+xueli[0]
else:
print u"学历:本科"
print "***********************************************"
if self.page<=5:
self.meiyiye()
if __name__ == "__main__":
    # Entry point: build the crawler and start from the page configured
    # in __init__ (the recursion inside the class handles the rest).
    spider = we()
    spider.meiyiye()
爬取js渲染过的页面(爬取一个婚庆网站为例)
最新推荐文章于 2023-07-22 22:51:59 发布