Core scrapy code for scraping data
I recently had the sudden urge to learn scrapy and pull some data back, purely to scratch my own itch. With that in mind I studied up and wrote the spider code below, the part that does the actual scraping, for myself and other beginners to read and study. The code has been tested against quite a few sites and can be used as-is, but note that the fields in the item need to be defined correctly.
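For context, the spider imports TurSpiderItem from tur_spider.items. The original post doesn't show that file, but judging from the item['info'], item['link'], and commented-out item['new'] assignments in the spider, a minimal sketch of tur_spider/items.py could look like this (the exact field set is my assumption):

# -*- coding: utf-8 -*-
# tur_spider/items.py -- minimal sketch, inferred from the spider below
import scrapy


class TurSpiderItem(scrapy.Item):
    info = scrapy.Field()  # text pulled from p/a/span tags
    link = scrapy.Field()  # URLs collected from a tags
    new = scrapy.Field()   # only needed for the commented-out div block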
# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy.http import Request

from tur_spider.items import TurSpiderItem as Item

class SpiderTurSpider(scrapy.Spider):
    name = 'spider_tur'
    this_doman = 'hao123.com'
    this_url = 'https://www.hao123.com/'
    allowed_domains = [this_doman]
    start_urls = [this_url]
    all_set = []

    # Seed start_urls from links collected on a previous run, then delete
    # both output files so this run starts with a clean slate.
    try:
        file_path = "links.ini"
        file_path1 = "info.ini"
        with open(file_path, 'r', encoding="utf-8") as f:
            lines = f.readlines()
        os.remove(file_path)
        os.remove(file_path1)
        for i, line in enumerate(lines, 1):
            print(i)
            line = line.replace('\n', '')
            # Keep only links that belong to the target domain.
            if this_doman in line:
                if line not in start_urls:
                    start_urls.append(line)
                    with open(file_path, 'a', encoding="utf-8") as f:
                        f.write(line + '\n')
    except Exception as e:
        print('error', e)
    print('ok')
    def parse(self, response):
        # -------- scrape <p> tags --------
        lis = response.xpath('//p')
        for li in lis:
            item = Item()
            # item['info'] = li.xpath('./descendant::text()').extract()
            item['info'] = li.xpath('./text()').extract()
            yield item

        # -------- scrape <a> tags --------
        lis1 = response.xpath('//a')
        for li in lis1:
            item = Item()
            item['info'] = li.xpath('./descendant::text()').extract()
            # item['link'] = ""
            yield item

        # -------- scrape <span> tags --------
        lis2 = response.xpath('//span')
        for li in lis2:
            item = Item()
            item['info'] = li.xpath('./descendant::text()').extract()
            # item['link'] = ""
            yield item
        # -------- example: scrape <div class="content"> --------
        # lis_div_content = response.xpath('//div[@class="content"]')
        # for info in lis_div_content:
        #     item = Item()
        #     item['new'] = info.xpath('./descendant::text()').extract()
        #     yield item
        # -------- collect URLs and follow them --------
        lis3 = response.xpath('//a')
        for li in lis3:
            try:
                item = Item()
                tmp_url = li.xpath('./@href').extract_first()
                if tmp_url is None:
                    continue
                # Resolve relative hrefs against the current page; the
                # original hard-coded another site's URL as the prefix
                # here, which would have built broken links.
                link = response.urljoin(tmp_url)
                # One extra check so we don't pick up other sites' content.
                if self.this_doman in link:
                    if link not in self.all_set:
                        self.all_set.append(link)
                        yield Request(link, callback=self.parse)
                    item['link'] = link
                    yield item
            except Exception as e:
                print('error happen', e)
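One last note: the spider deletes links.ini and info.ini at startup, which implies something downstream re-creates them by appending scraped items, typically an item pipeline. The original post doesn't include that part, so the following is only a sketch of what such a pipeline might look like (the class name and file format are my assumptions):

# -*- coding: utf-8 -*-
# tur_spider/pipelines.py -- assumed pipeline, not part of the original post


class TurSpiderPipeline(object):
    def process_item(self, item, spider):
        # Append followed links to links.ini so the next run can seed
        # start_urls from them.
        if item.get('link'):
            with open('links.ini', 'a', encoding='utf-8') as f:
                f.write(item['link'] + '\n')
        # Append scraped text to info.ini.
        if item.get('info'):
            with open('info.ini', 'a', encoding='utf-8') as f:
                f.write(''.join(item['info']) + '\n')
        return item

To use it, enable the pipeline in settings.py, e.g. ITEM_PIPELINES = {'tur_spider.pipelines.TurSpiderPipeline': 300}, then start the crawl with scrapy crawl spider_tur.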