# NOTE: the target site has anti-scraping measures and may serve an access-
# verification page; no proxy pool is used here.
# Because this is a well-known large site, only part of the spider is shown,
# to avoid infringement or misuse — for learning and exchange purposes only.
class FtxSpider(scrapy.Spider):
    """Crawl second-hand housing listings from fang.com, one city at a time.

    Flow: city index page -> per-city landing page -> listing pages
    (with pagination). Each yielded item carries the originating city.
    """

    name = 'ftx'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']
    # Throttle via Scrapy's scheduler instead of calling time.sleep() inside
    # a callback, which would block the whole reactor.
    custom_settings = {"DOWNLOAD_DELAY": 3}

    # (item field, XPath relative to one <dl> listing entry).
    _LISTING_FIELDS = [
        ("title", './dd/h4[@class="clearfix"]/a/span/text()'),
        ("structure", './dd/p[1]/text()[1]'),
        ("mm", './dd/p[1]/text()[2]'),
        ("height", './dd/p[1]/text()[3]'),
        ("fangxiang", './dd/p[1]/text()[4]'),
        ("year", './dd/p[1]/text()[5]'),
        ("people", './dd/p[1]/span/a/text()'),
    ]

    def parse(self, response):
        """Emit one request per city from the city index table.

        Rows expose an ``id`` only for special sections; ``sffamily_B03_30``
        marks the overseas-property block, which we skip.
        """
        for tr in response.xpath('//div[@id="c02"]/table/tr'):
            citys = tr.xpath('./td[last()]/a')
            for row_id in tr.xpath('./@id').extract():
                if row_id == "sffamily_B03_30":  # overseas listings: skip
                    continue
                for city in citys:
                    item = FangtianxiaItem()
                    item["city"] = city.xpath("./text()").extract_first()
                    city_url = response.urljoin(
                        city.xpath("./@href").extract_first().strip())
                    yield scrapy.Request(
                        url=city_url,
                        callback=self.two_parse,
                        meta={"item": deepcopy(item)},
                        dont_filter=True,
                    )

    def two_parse(self, response):
        """Hop from a city landing page to its listing page.

        Some cities link straight to the listing page while others need one
        more hop, hence the fallback to ``response.url``.
        """
        item = response.meta["item"]
        two_url = response.xpath(
            "//div[@class='s6']/div[@class='listBox']/ul/li/a/@href"
        ).extract_first()
        # BUG FIX: the original used callback=self.list_page, a method that
        # does not exist — the listing parser is list_parse.
        target_url = two_url if two_url is not None else response.url
        yield scrapy.Request(
            url=target_url,
            callback=self.list_parse,
            meta={"item": deepcopy(item)},
            dont_filter=True,
        )

    def list_parse(self, response):
        """Parse one listing page into items and follow pagination."""
        # Keep the "city" value set in parse(); the original dropped it by
        # building fresh items here.
        item = response.meta["item"]
        for dl in response.xpath('//div[@class="shop_list shop_list_4"]/dl'):
            entry = deepcopy(item)
            for field, xp in self._LISTING_FIELDS:
                value = dl.xpath(xp).extract_first()
                # Not every listing carries every field; strip only if present.
                entry[field] = value.strip() if value else value
            # BUG FIX: the original built items but never yielded them, so
            # nothing ever reached the item pipeline.
            yield entry
        # Pagination quirk: the "next page" anchor sits at p[1] on some pages
        # and p[3] on others, so probe both positions in order.
        for pos in (1, 3):
            anchor = response.xpath('//*[@id="list_D10_15"]/p[%d]/a' % pos)
            href = anchor.xpath("./@href").extract_first()
            if href and anchor.xpath("./text()").extract_first() == "下一页":
                yield scrapy.Request(
                    url=response.urljoin(href),
                    callback=self.list_parse,
                    meta={'item': deepcopy(item)},
                )
                break