Python 3 Crawler Beginner Notes (3): Crawling News Pages
1. Example: the dapp.tech site
News pages contain both text and images (as hyperlinks), and we want to crawl the title, body text, images, category, and so on. Text and images could simply be extracted and stored separately, but out of a tinkering impulse I decided to extract them together, without downloading or storing the images themselves, so that each image's description and URL stay embedded at their original position in the text. The implementation uses recursion and was mostly an exercise in getting familiar with XPath.
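Before the spider code, here is a minimal sketch of the item it fills in. Only the html, url, and content fields actually appear in the snippets below; the original post elides its remaining fields, so anything beyond those three names is an assumption:

# items.py -- minimal sketch; fields beyond html/url/content are elided in the post
import scrapy

class DapptechItem(scrapy.Item):
    html = scrapy.Field()     # raw page HTML (response.body)
    url = scrapy.Field()      # page URL (response.url)
    content = scrapy.Field()  # list of text fragments built by proNest()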
import scrapy
from scrapy import Request

from ..items import DapptechItem


class DapptechSpider(scrapy.Spider):  # imports and class skeleton assumed; the post shows only the methods
    name = 'dapptech'

    def start_requests(self):
        return [Request("https://dapp.tech/cloudflare-introduces-ipfs-gateway/",
                        callback=self.parse_article)]
    # The first page of the article list also contains one full article,
    # so crawl that article as well.
    def first_page(self, response):
        item = DapptechItem()
        # Save the page's raw HTML and its URL from the response.
        item['html'] = response.body
        item['url'] = response.url
        ......
        #print(item)
        yield item
        # Collect the article URLs listed on the first page.
        blocks = response.xpath('.//div[@class="block-article bdaiaFadeIn"]')
        for block in blocks:
            article_href = block.xpath('article/header/h2[@class="entry-title"]/a/@href').extract_first()
            if article_href is not None:
                yield Request(article_href, callback=self.parse_article)
        # Find the URL of the next page of the article list.
        div_pagination = response.xpath('.//div[@class="bdaia-pagination"]')
        if div_pagination:  # xpath() returns a (possibly empty) SelectorList, never None
            childs = div_pagination.xpath('*')
            child_count = 0
            for child in childs:
                child_class = child.xpath('@class').extract_first()
                child_count += 1
                if child_class == 'current':
                    break
            # The sibling right after the element marked 'current' links to the
            # next page; on the last page there is no such sibling.
            if child_count < len(childs):
                next_page_href = childs[child_count].xpath('@href').extract_first()
                print(next_page_href)
                if next_page_href is not None:
                    yield Request(next_page_href, callback=self.article_list)
    # Handle article lists from the second page onward; the article URLs are
    # extracted the same way as in first_page().
    def article_list(self, response):
        ......
    # Crawl an individual article.
    def parse_article(self, response):
        item = DapptechItem()
        item['html'] = response.body
        item['url'] = response.url
        ......
        post_content = response.xpath('.//div[@class="bdaia-post-content"]')
        if post_content:
            content_list = []
            first_div = post_content.xpath('div[1]')
            if first_div:
                proNest(content_list, first_div)
            # '*' selects all element children of this node.
            childs = post_content.xpath('*')
            for child in childs[2:]:
                # Recurse into each child element.
                proNest(content_list, child)
                """
                print("attrs: ")
                attrs = child.xpath('.//@href | .//@src')
                for attr in attrs:
                    print(attr)
                """
            item['content'] = content_list
            #print("content:", content_list)
        ......
        yield item
# Recursive walk: append plain text as-is and keep img/iframe/a/link
# references inline at their original positions.
def proNest(text, selector):
    # 'node()' selects all child nodes; unlike '*' it also includes
    # text nodes and the like.
    childs = selector.xpath('node()')
    for child in childs:
        # 'name()' returns the node's tag name.
        child_name = child.xpath('name()').extract_first()
        # No name means a plain text node: extract its content directly.
        if child_name is None:
            text_content = child.extract()
            if len(text_content) == 0:
                continue
            #print(text_content)
            text.append(text_content)
        # Otherwise the node is an img, iframe, a, link, or some other element.
        else:
            if child_name == 'img':
                # extract_first('') falls back to '' if the attribute is missing.
                img_content = "<img>" + child.xpath('@src').extract_first('') + "</img>"
                img_text = child.xpath('text()').extract_first()
                if img_text is not None:
                    img_content += img_text
                #print(img_content)
                text.append(img_content.strip())
            elif child_name == 'iframe':
                iframe_content = "<iframe>" + child.xpath('@src').extract_first('') + "</iframe>"
                iframe_text = child.xpath('text()').extract_first()
                if iframe_text is not None:
                    iframe_content += iframe_text
                #print(iframe_content)
                text.append(iframe_content.strip())
            elif child_name == 'a':
                a_content = "<a>" + child.xpath('@href').extract_first('') + "</a>"
                a_text = child.xpath('text()').extract_first()
                if a_text is not None:
                    a_content += a_text
                #print(a_content)
                text.append(a_content.strip())
            elif child_name == 'link':
                link_content = "<link>" + child.xpath('@href').extract_first('') + "</link>"
                link_text = child.xpath('text()').extract_first()
                if link_text is not None:
                    link_content += link_text
                #print(link_content)
                text.append(link_content.strip())
            # Other elements may still contain img/iframe/a/link descendants.
            else:
                links = child.xpath('.//a | .//link')
                srcs = child.xpath('.//img | .//iframe')
                # With no such descendants, extract the text directly;
                # otherwise recurse.
                if len(links) == 0 and len(srcs) == 0:
                    content = child.xpath('.//text()').extract()
                    if len(content) == 0:
                        continue
                    #print(content)
                    text += content
                else:
                    proNest(text, child)
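To see what proNest produces without running the whole spider, you can exercise it on a canned HTML fragment with parsel, the selector library underneath Scrapy. A minimal sketch, with a made-up fragment, assuming proNest from above is in scope:

from parsel import Selector

# A made-up article fragment for illustration.
html = ('<div class="bdaia-post-content">'
        '<p>IPFS is a <a href="https://ipfs.io">peer-to-peer</a> protocol.</p>'
        '<p><img src="https://example.com/pic.png">a gateway diagram</p>'
        '</div>')

content = []
proNest(content, Selector(text=html).xpath('.//div[@class="bdaia-post-content"]'))
print(content)
# ['IPFS is a ', '<a>https://ipfs.io</a>peer-to-peer', ' protocol.',
#  '<img>https://example.com/pic.png</img>', 'a gateway diagram']

Note how node() (unlike '*') also yields the bare text nodes, which is what keeps the link and image placeholders at their original positions in the extracted text.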