Python 3 Crawler Beginner Notes (3): Crawling News Pages
1. Example: the dapp.tech site
News pages contain both text and images (as hyperlinks), and we want to crawl the title, body text, images, category, and so on. Text and images could simply be extracted and stored separately, but out of a tinkering impulse I decided to extract them together, without downloading or storing the images themselves, so that each image's description and URL stay embedded at their original position in the text. The implementation uses recursion and was mostly an exercise in getting familiar with XPath.
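Before the spider code, here is a minimal sketch of the item it fills in. Only the html, url, and content fields actually appear in the snippets below; the original post elides its remaining fields, so anything beyond those three names is an assumption:

# items.py -- minimal sketch; fields beyond html/url/content are elided in the post
import scrapy

class DapptechItem(scrapy.Item):
    html = scrapy.Field()     # raw page HTML (response.body)
    url = scrapy.Field()      # page URL (response.url)
    content = scrapy.Field()  # list of text fragments built by proNest()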
import scrapy
from scrapy import Request

from ..items import DapptechItem


class DapptechSpider(scrapy.Spider):  # imports and class skeleton assumed; the post shows only the methods
    name = 'dapptech'

    def start_requests(self):
        return [Request("https://dapp.tech/cloudflare-introduces-ipfs-gateway/",
                        callback=self.parse_article)]
    # The first page of the article list also contains one full article,
    # so crawl that article as well.
    def first_page(self, response):
        item = DapptechItem()
        # Save the page's raw HTML and its URL from the response.
        item['html'] = response.body
        item['url'] = response.url
        ......
        #print(item)
        yield item
        # Collect the article URLs listed on the first page.
        blocks = response.xpath('.//div[@class="block-article bdaiaFadeIn"]')
        for block in blocks:
            article_href = block.xpath('article/header/h2[@class="entry-title"]/a/@href').extract_first()
            if article_href is not None:
                yield Request(article_href, callback=self.parse_article)
        # Find the URL of the next page of the article list.
        div_pagination = response.xpath('.//div[@class="bdaia-pagination"]')
        if div_pagination:  # xpath() returns a (possibly empty) SelectorList, never None
            childs = div_pagination.xpath('*')
            child_count = 0
            for child in childs:
                child_class = child.xpath('@class').extract_first()
                child_count += 1
                if child_class == 'current':
                    break
            # The sibling right after the element marked 'current' links to the
            # next page; on the last page there is no such sibling.
            if child_count < len(childs):
                next_page_href = childs[child_count].xpath('@href').extract_first()
                print(next_page_href)
                if next_page_href is not None:
                    yield Request(next_page_href, callback=self.article_list)
    # Handle article lists from the second page onward; the article URLs are
    # extracted the same way as in first_page().
    def article_list(self, response):
        ......
    # Crawl an individual article.
    def parse_article(self, response):
        item = DapptechItem()
        item['html'] = response.body
        item['url'] = response.url
        ......
        post_content = response.xpath('.//div[@class="bdaia-post-content"]')
        if post_content:
            content_list = []
            first_div = post_content.xpath('div[1]')
            if first_div:
                proNest(content_list, first_div)
            # '*' selects all element children of this node.
            childs = post_content.xpath('*')
            for child in childs[2:]:
                # Recurse into each child element.
                proNest(content_list, child)
                """
                print("attrs: ")
                attrs = child.xpath('.//@href | .//@src')
                for attr in attrs:
                    print(attr)
                """
            item['content'] = content_list
            #print("content:", content_list)
        ......
        yield item
# Recursive walk: append plain text as-is and keep img/iframe/a/link
# references inline at their original positions.
def proNest(text, selector):
    # 'node()' selects all child nodes; unlike '*' it also includes
    # text nodes and the like.
    childs = selector.xpath('node()')
    for child in childs:
        # 'name()' returns the node's tag name.
        child_name = child.xpath('name()').extract_first()
        # No name means a plain text node: extract its content directly.
        if child_name is None:
            text_content = child.extract()
            if len(text_content) == 0:
                continue
            #print(text_content)
            text.append(text_content)
        # Otherwise the node is an img, iframe, a, link, or some other element.
        else:
            if child_name == 'img':
                # extract_first('') falls back to '' if the attribute is missing.
                img_content = "<img>" + child.xpath('@src').extract_first('') + "</img>"
                img_text = child.xpath('text()').extract_first()
                if img_text is not None:
                    img_content += img_text
                #print(img_content)
                text.append(img_content.strip())
            elif child_name == 'iframe':
                iframe_content = "<iframe>" + child.xpath('@src').extract_first('') + "</iframe>"
                iframe_text = child.xpath('text()').extract_first()
                if iframe_text is not None:
                    iframe_content += iframe_text
                #print(iframe_content)
                text.append(iframe_content.strip())
            elif child_name == 'a':
                a_content = "<a>" + child.xpath('@href').extract_first('') + "</a>"
                a_text = child.xpath('text()').extract_first()
                if a_text is not None:
                    a_content += a_text
                #print(a_content)
                text.append(a_content.strip())
            elif child_name == 'link':
                link_content = "<link>" + child.xpath('@href').extract_first('') + "</link>"
                link_text = child.xpath('text()').extract_first()
                if link_text is not None:
                    link_content += link_text
                #print(link_content)
                text.append(link_content.strip())
            # Other elements may still contain img/iframe/a/link descendants.
            else:
                links = child.xpath('.//a | .//link')
                srcs = child.xpath('.//img | .//iframe')
                # With no such descendants, extract the text directly;
                # otherwise recurse.
                if len(links) == 0 and len(srcs) == 0:
                    content = child.xpath('.//text()').extract()
                    if len(content) == 0:
                        continue
                    #print(content)
                    text += content
                else:
                    proNest(text, child)
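To see what proNest produces without running the whole spider, you can exercise it on a canned HTML fragment with parsel, the selector library underneath Scrapy. A minimal sketch, with a made-up fragment, assuming proNest from above is in scope:

from parsel import Selector

# A made-up article fragment for illustration.
html = ('<div class="bdaia-post-content">'
        '<p>IPFS is a <a href="https://ipfs.io">peer-to-peer</a> protocol.</p>'
        '<p><img src="https://example.com/pic.png">a gateway diagram</p>'
        '</div>')

content = []
proNest(content, Selector(text=html).xpath('.//div[@class="bdaia-post-content"]'))
print(content)
# ['IPFS is a ', '<a>https://ipfs.io</a>peer-to-peer', ' protocol.',
#  '<img>https://example.com/pic.png</img>', 'a gateway diagram']

Note how node() (unlike '*') also yields the bare text nodes, which is what keeps the link and image placeholders at their original positions in the extracted text.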