LinkExtractor
from scrapy.linkextractors import LinkExtractor
Link
from scrapy.link import Link
Link 的四个属性:
url text fragment nofollow
如果需要解析出链接文本,需要在创建 LinkExtractor 时传入 attrs 参数:
# Build an extractor that is told to consider the 'href' and 'text' attributes.
# NOTE(review): per the Scrapy docs, `attrs` appears to select which tag
# attributes are scanned for link URLs (default ('href',)); confirm whether
# adding 'text' is actually required for Link.text to be populated.
link_extractor = LinkExtractor(attrs=('href','text'))
# `response` is not defined in this snippet; presumably it is the response
# object received by a spider callback.
links = link_extractor.extract_links(response)
使用示例
import scrapy
from scrapy.linkextractors import LinkExtractor
class DemoSpider(scrapy.Spider):
    """Demo spider that prints the links a LinkExtractor finds on the start page."""

    name = 'spider'
    start_urls = [
        "https://book.douban.com/"
    ]

    def parse(self, response):
        """Extract links matching the allow pattern and print each link's text and URL.

        Args:
            response: The response passed to this callback for a start URL.
        """
        # `allow` is a regular expression. A raw string with escaped dots is
        # used so that "." only matches a literal dot in the host name.
        # NOTE(review): the pattern targets tianyancha.com while start_urls
        # points at douban.com — confirm which site is intended.
        link_extractor = LinkExtractor(
            allow=r"https://www\.tianyancha\.com/brand/b.*"
        )
        links = link_extractor.extract_links(response)
        for link in links:
            print(link.text, link.url)
if __name__ == '__main__':
    # `cmdline` is not imported at the top of this example, so import it here;
    # without this the call below fails with NameError.
    from scrapy import cmdline

    # Equivalent to running `scrapy crawl spider` from the shell.
    cmdline.execute("scrapy crawl spider".split())