今天爬取中彩网福彩3d[http://www.zhcw.com/3d/]的时候,碰到iframe嵌套,xpath始终取不到值,如下图:
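无论怎么写xpath,取到的值都为null。后来发现页面里有一个iframe,开奖数据其实在iframe的src所指向的另一个页面里(也就是下文spider中start_urls使用的 http://kaijiang.zhcw.com/zhcw/html/3d/list_1.html)。
然后直接请求这个iframe的url,就可以用xpath取到值了。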
好了,问题解决。查阅网上资料后得知,这个问题也可以正面攻克(直接在原页面里处理iframe),但比较麻烦,不推荐花时间去做。
最后附上本人代码,爬虫框架用的是scrapy,存储用的MySQL数据库。
items
import scrapy
class Lottery3DItem(scrapy.Item):
    """One 3D lottery draw: its date, its issue number and three drawn numbers.

    The three number fields keep the historical ``blue1``..``blue3`` names
    because other parts of the project address them by those keys.
    """

    # Draw date.
    date = scrapy.Field()

    # Issue number of the draw.
    issue = scrapy.Field()

    # First drawn number.
    blue1 = scrapy.Field()

    # Second drawn number.
    blue2 = scrapy.Field()

    # Third drawn number.
    blue3 = scrapy.Field()
spider
# -*- coding: utf-8 -*-
import scrapy
from ..items import Lottery3DItem
class LotterySpider(scrapy.Spider):
    """Crawl the paginated 3D lottery result lists on kaijiang.zhcw.com.

    The crawl starts at ``list_1.html`` and then requests ``list_2.html``,
    ``list_3.html`` and so on, one page at a time. Every result row becomes
    one ``Lottery3DItem``. Pagination stops at the first page that has no
    result rows.
    """

    name = 'lottery'
    allowed_domains = ['zhcw.com']
    start_urls = ['http://kaijiang.zhcw.com/zhcw/html/3d/list_1.html']
    # Number of the most recently scheduled page; advanced once per parsed page.
    index = 1
    # Not used by this spider; kept so that existing references keep working.
    items = []

    def parse(self, response):
        """Yield one item per result row, then a request for the next page.

        Args:
            response: The downloaded list page.

        Yields:
            ``Lottery3DItem`` objects for the rows of this page, followed by a
            ``scrapy.Request`` for the next page when this page had rows.
        """
        rows = response.xpath("//tr")
        # The first two rows and the last row are not result rows (presumably
        # table headers and the pager -- confirm against the page). Slicing,
        # unlike three unconditional pop() calls, cannot raise IndexError on a
        # short or empty page.
        data_rows = rows[2:-1]
        if not data_rows:
            # No results on this page: we ran past the last page (or the
            # layout changed), so stop paginating instead of looping forever.
            return

        for node in data_rows:
            item = Lottery3DItem()
            item["date"] = node.xpath("./td[1]/text()").extract_first()
            item["issue"] = node.xpath("./td[2]/text()").extract_first()
            item["blue1"] = node.xpath("./td[3]/em[1]/text()").extract_first()
            item["blue2"] = node.xpath("./td[3]/em[2]/text()").extract_first()
            item["blue3"] = node.xpath("./td[3]/em[3]/text()").extract_first()
            yield item

        # Schedule the following list page exactly once per parsed page.
        self.index += 1
        next_url = "http://kaijiang.zhcw.com/zhcw/html/3d/list_{}.html".format(self.index)
        yield scrapy.Request(url=next_url, callback=self.parse)
pipeline
import pymysql
class Lottery3DPipeline(object):
    """Item pipeline that inserts every scraped item into the ``lottery_3d`` table."""

    def __init__(self, host='127.0.0.1', user='root', password='mysql',
                 database='spider', charset='utf8'):
        """Open the MySQL connection and a cursor used for all inserts.

        The defaults reproduce the previously hard-coded settings, so the
        pipeline can still be constructed without arguments.

        Args:
            host: MySQL server host.
            user: MySQL user name.
            password: Password for ``user``.
            database: Name of the database that holds ``lottery_3d``.
            charset: Connection character set.
        """
        # ``password``/``database`` are the current PyMySQL keyword names;
        # ``passwd``/``db`` are deprecated aliases for them.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset=charset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row and commit it.

        Args:
            item: Mapping with the keys ``date``, ``issue``, ``blue1``,
                ``blue2`` and ``blue3``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            Exception: Whatever the insert or commit raised; the transaction
                is rolled back before the exception is re-raised.
        """
        sql = "insert into lottery_3d(date, issue, blue1, blue2, blue3) VALUES(%s, %s, %s, %s, %s)"
        params = (item['date'], item['issue'],
                  item['blue1'], item['blue2'], item['blue3'])
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction open on the shared
            # connection; the error itself is still reported to the caller.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.conn.close()