# -*- coding: utf-8 -*-
import scrapy
import jsonpath
import json
from ..items import YuehuiItem
class YuehuiSpider(scrapy.Spider):
    """Crawl user profiles from yuehui.163.com.

    Flow: ``parse`` requests the AJAX search-result pages, ``parsePage``
    pulls every ``id`` value out of the returned JSON and requests the
    matching profile page, and ``parseDeta`` turns one profile page into a
    ``YuehuiItem``.
    """

    name = 'yuehui'
    allowed_domains = ['163.com']
    start_urls = ['http://yuehui.163.com/searchusers.do']
    # Search endpoint; the only placeholder is the 1-based page number.
    base_url = 'http://yuehui.163.com/searchusersrcm.do?ajax=1&ageBegin=18&ageEnd=25&aim=-1&marriage=0&mode=4&order=8&province=22&city=0&district=-1&sex=0&userTag=0&vippage=-1&searchType=0&page={}&pagesize=8'
    # Profile page; the placeholder is the user id taken from the search JSON.
    detail_url = 'http://yuehui.163.com/viewuser.do?id={}'
    # Last search page to request (pages 1..last_page inclusive).
    last_page = 2
    # Value stored for any field the profile page does not provide.
    not_disclosed = '未透露'

    def parse(self, response):
        """Yield one request per search-result page.

        The response of the start URL itself is not inspected; it only
        triggers the paged AJAX requests.
        """
        # int() so that a string value for last_page also works.
        for page in range(1, int(self.last_page) + 1):
            page_url = self.base_url.format(page)
            self.logger.debug('search page url: %s', page_url)
            yield scrapy.Request(url=page_url, callback=self.parsePage)

    def parsePage(self, response):
        """Yield a profile-page request for every ``id`` in the search JSON."""
        try:
            payload = json.loads(response.text)
        except ValueError:
            # The body was not JSON; nothing to extract from it.
            self.logger.warning('non-JSON search response from %s', response.url)
            return
        # jsonpath() returns False (not an empty list) when nothing matches,
        # which must not be iterated.
        user_ids = jsonpath.jsonpath(payload, '$..id') or []
        for user_id in user_ids:
            yield scrapy.Request(
                url=self.detail_url.format(user_id),
                callback=self.parseDeta,
            )

    @staticmethod
    def _first_text(nodes, index):
        """Return the first text node under ``nodes[index]``, or None if absent."""
        if index >= len(nodes):
            return None
        return nodes[index].xpath('.//text()').extract_first()

    def _labelled_value(self, nodes, index):
        """Return the value part of a ``label:value`` entry at ``nodes[index]``.

        Falls back to ``not_disclosed`` when the entry or the colon is missing.
        """
        text = self._first_text(nodes, index)
        if text is None:
            return self.not_disclosed
        parts = text.split(':')
        return parts[1] if len(parts) > 1 else self.not_disclosed

    def parseDeta(self, response):
        """Build a ``YuehuiItem`` from one profile page.

        Pages without a nickname or without the info list are skipped with a
        warning; individual missing fields are filled with ``not_disclosed``.
        """
        name = response.css('p.nick::text').extract_first()
        # Entries of the detail list; fields are addressed by position below.
        info = response.css('ul.infolist li')
        if name is None or not info:
            self.logger.warning('no profile data found on %s', response.url)
            return

        # Sex: remove the label as a prefix. str.strip() would treat the
        # label as a set of characters and could eat into the value.
        sex_label = '性别:'
        sex = self._first_text(info, 0)
        if sex is None:
            sex = self.not_disclosed
        elif sex.startswith(sex_label):
            sex = sex[len(sex_label):]

        # Location and job share the fifth list entry.
        location = self.not_disclosed
        job = self.not_disclosed
        if len(info) > 4:
            location = info[4].xpath('./p[1]/span[2]/text()').extract_first(
                default=self.not_disclosed)
            job_texts = info[4].xpath('./p[2]/span/text()').extract()
            # The job is only taken when exactly two spans are present.
            if len(job_texts) == 2:
                job = job_texts[1]

        item = YuehuiItem()
        item['name'] = name
        item['sex'] = sex
        item['marriage'] = self._labelled_value(info, 1)   # marital status
        item['age'] = self._labelled_value(info, 2)
        item['degree'] = self._labelled_value(info, 3)     # education
        item['location'] = location
        item['job'] = job
        item['height'] = self._labelled_value(info, 5)
        item['weight'] = self._labelled_value(info, 7)
        item['xingzuo'] = self._labelled_value(info, 9)    # zodiac sign
        item['shengxiao'] = self._labelled_value(info, 11)  # Chinese zodiac
        yield item
# Dating site scrapy-spider
# Latest recommended article published 2019-09-07 15:57:39