兄弟连 spider
import scrapy
from py10.items import TeacherItem  # import the item model

class XdlSpider(scrapy.Spider):
    name = 'xdl2'
    allowed_domains = ['itxdl.cn']
    start_urls = ['http://www.itxdl.cn/activity/teacher/teacher_lieibiao/']

    def parse(self, response):
        # Get every teacher div
        # XPath version:
        # teacher_list = response.xpath('//div[@class="php_jiangshi_liebiao"]')
        # CSS version:
        teacher_list = response.css('div.php_jiangshi_liebiao')
        for teacher in teacher_list:
            item = TeacherItem()
            # Teacher name / industry / photo
            # XPath version (extract() returns a list):
            # name = teacher.xpath('.//h1/text()').extract()[0]
            # industry = teacher.xpath('.//p/text()').extract()[0]
            # image = teacher.xpath('.//img/@src').extract()[0]
            # CSS version:
            name = teacher.css('h1::text').extract()[0]
            industry = teacher.css('p::text').extract()[0]
            image = teacher.css('img::attr(src)').extract()[0]
            # Populate the item
            item['name'] = name
            item['industry'] = industry
            item['image'] = image
            # Hand the item over to the item pipelines
            yield item
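TeacherItem is imported above but not shown in these notes. A minimal sketch of what it presumably looks like in py10/items.py, assuming one Field per key assigned by the spider and by TeacherPipeline below (the 'spider' field is an assumption drawn from that pipeline):

import scrapy

class TeacherItem(scrapy.Item):
    # assumed fields, matching the keys assigned in XdlSpider and TeacherPipeline
    name = scrapy.Field()
    industry = scrapy.Field()
    image = scrapy.Field()
    spider = scrapy.Field()  # written by TeacherPipeline via item['spider'] = spider.name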
pipeline
import json

class Py10Pipeline(object):
    # Every pipeline must implement process_item()
    def process_item(self, item, spider):
        return item

class TeacherPipeline(object):
    def __init__(self):
        self.f = open('teacher.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write one JSON object per line
        item['spider'] = spider.name  # record which spider produced the item
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # Return the item so the remaining pipelines can keep processing it
        return item

    # Called when the spider finishes
    def close_spider(self, spider):
        self.f.close()
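For these pipelines to run they also have to be enabled in the project settings. A minimal sketch of the relevant snippet in settings.py, assuming the project is named py10 as in the import above (the numbers are priorities; lower runs first):

# settings.py (sketch)
ITEM_PIPELINES = {
    'py10.pipelines.Py10Pipeline': 300,
    'py10.pipelines.TeacherPipeline': 400,
}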
Tencent recruitment spider
import scrapy
from day12.items import TencentItem  # project name inferred from the pipeline path below

class TencentSpider(scrapy.Spider):
    name = 'tencent'                    # spider name
    allowed_domains = ['tencent.com']   # allowed domains
    start_urls = ['http://hr.tencent.com']
    base_url = 'http://hr.tencent.com/position.php?&start=%d'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Settings that apply to this spider only
    custom_settings = {
        'ITEM_PIPELINES': {
            'day12.pipelines.TencentPipeline': 1,
        },
        'DOWNLOAD_DELAY': 0,
        'CONCURRENT_REQUESTS': 32,
    }

    def parse(self, response):
        # Build a request for every listing page
        for i in range(0, 2910 + 1, 10):
            fullurl = self.base_url % i
            yield scrapy.Request(url=fullurl, callback=self.parseList, headers=self.headers)

    # Parse a listing page
    def parseList(self, response):
        detail_urls = response.css('tr.even a::attr(href), tr.odd a::attr(href)').extract()
        for url in detail_urls:
            fullurl = 'http://hr.tencent.com/' + url
            yield scrapy.Request(url=fullurl, callback=self.parseDetail, headers=self.headers)

    # Parse a detail page
    def parseDetail(self, response):
        item = TencentItem()
        title = response.xpath('//td[@id="sharetitle"]/text()').extract()[0]
        info = response.xpath('//table//tr[2]/td/text()').extract()
        location = info[0]
        p_type = info[1]
        number = info[2].strip('人')  # strip the "人" (people) unit
        duty = response.xpath('//table//tr[3]//li/text()').extract()
        duty = ''.join(duty)
        requirement = response.xpath('//table//tr[4]//li/text()').extract()
        requirement = ''.join(requirement)
        item["title"] = title
        item["location"] = location
        item["p_type"] = p_type
        item["number"] = number
        item["duty"] = duty
        item["requirement"] = requirement
        # Hand the item to the pipeline
        yield item
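Note that the custom headers are only attached to the follow-up requests; the initial request for start_urls goes out with Scrapy's defaults. If that matters, overriding start_requests is one way to cover it. A sketch of a method that could be added inside TencentSpider (not part of the original notes):

    # Sketch: send the initial request with the same custom headers
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, headers=self.headers)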
import scrapy
import json

class TencentItem(scrapy.Item):
    title = scrapy.Field()
    location = scrapy.Field()
    p_type = scrapy.Field()
    number = scrapy.Field()
    duty = scrapy.Field()
    requirement = scrapy.Field()

class TencentPipeline(object):
    def __init__(self):
        self.f = open('position.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()
Other spiders
import os
import requests
import scrapy

class TaobaoItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    sales = scrapy.Field()
    img_url = scrapy.Field()

class StoreImagePipeline(object):
    def process_item(self, item, spider):
        # Download the product image with requests and save it to disk
        img_url = 'http:' + item['img_url']  # pic_url comes without a scheme
        response = requests.get(img_url)
        if not os.path.exists('download'):
            os.mkdir('download')
        filename = 'download/' + img_url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item

class ImgsrcPipeline(object):
    def process_item(self, item, spider):
        # Download the photo and save it to disk
        img_url = item['src']
        response = requests.get(img_url)
        if not os.path.exists('tuchong'):
            os.mkdir('tuchong')
        filename = 'tuchong/' + img_url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item
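Downloading with requests inside process_item blocks Scrapy's event loop while each image transfers. Scrapy's built-in ImagesPipeline does the same job through the asynchronous downloader; a minimal sketch for the Taobao items, assuming IMAGES_STORE is configured and Pillow is installed (class name and settings paths are illustrative):

# Sketch: Scrapy's built-in ImagesPipeline instead of requests
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class TaobaoImagePipeline(ImagesPipeline):  # hypothetical name
    def get_media_requests(self, item, info):
        # Scrapy downloads this request asynchronously and stores the file under IMAGES_STORE
        yield Request('http:' + item['img_url'])

# settings.py (sketch)
# ITEM_PIPELINES = {'scrapy_project.pipelines.TaobaoImagePipeline': 1}
# IMAGES_STORE = 'download'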
# Launcher scripts (each lives in its own file at the project root)
from scrapy.cmdline import execute
execute('scrapy crawl bj58'.split())

from scrapy.cmdline import execute
execute('scrapy crawl qiushibaike'.split())

from scrapy.cmdline import execute
execute('scrapy crawl taobao'.split())

from scrapy.cmdline import execute
execute('scrapy crawl tuchong'.split())
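scrapy.cmdline.execute hands control to the Scrapy CLI and exits when the crawl finishes, which is why each launcher runs exactly one spider. To drive several spiders from a single script, CrawlerProcess is the usual alternative; a sketch, assuming it is run from the project directory so get_project_settings() finds the spiders by name:

# Sketch: running several spiders from one script
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('taobao')   # spiders are looked up by name
process.crawl('tuchong')
process.start()           # blocks until all crawls finish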
import scrapy

class Bj58Spider(scrapy.Spider):
    name = 'bj58'
    allowed_domains = ['bj.58.com']
    start_urls = ['http://bj.58.com/ershouche/?PGTID=0d100000-0000-1ce3-e602-1acefd5f07af&ClickID=4']

    def parse(self, response):
        # with open('58.html', 'wb') as f:
        #     f.write(response.body)
        li_list = response.xpath('//ul[@class="car_list ac_container"]/li')
        for li_ele in li_list:
            # Listing title
            title = li_ele.xpath('./div[@class="col col2"]/a/h1//text()').extract()
            title = ''.join(title).strip()
            # Price
            price = li_ele.xpath('./div[@class="col col3"]/h3/text()').extract_first()
            # Spec/parameter text
            param = li_ele.xpath('./div[@class="col col2"]/div[@class="info_param"]/span/text()').extract()
            param = ''.join(param)
            # Tags
            tags = li_ele.xpath('./div[@class="col col2"]/div[@class="info_tags"]/div//text()').extract()
            tags = ''.join(tags).strip()
            print('==========' * 30)
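As written, parse only scrapes into local variables and prints a separator, so nothing reaches the pipelines. A minimal sketch of yielding the fields at the end of the loop (Scrapy also accepts plain dicts as items; the key names are illustrative):

            # Sketch: yield the scraped fields so pipelines / feed exports see them
            yield {
                'title': title,
                'price': price,
                'param': param,
                'tags': tags,
            }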
import scrapy

class QiushibaikeSpider(scrapy.Spider):
    name = 'qiushibaike'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/']

    def parse(self, response):
        # with open('qiushi.html', 'wb') as f:
        #     f.write(response.body)
        authors = response.xpath('//div[@class="author clearfix"]/a[2]/h2/text()').extract()
        print(len(authors))
        for i in range(0, len(authors)):
            author = authors[i]
            yield {'author': author}  # the original notes stop here; yielding keeps the loop from being a no-op
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy_project.items import TaobaoItem

class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com/list?spm=a219r.lm5734.0.0.478ae18amAPaDS&q=%E6%96%87%E8%83%B8%E5%A5%97%E8%A3%85&abver=old&input_query=%E6%96%87%E8%83%B8&suggest_offset=0&from=suggest&cat=1625&seller_type=taobao&style=list']

    def parse(self, response):
        # with open('taobao.html', 'wb') as f:
        #     f.write(response.body)
        base_url = 'https://s.taobao.com/list?spm=a219r.lm5734.0.0.478ae18amAPaDS&q=%E6%96%87%E8%83%B8%E5%A5%97%E8%A3%85&abver=old&input_query=%E6%96%87%E8%83%B8&suggest_offset=0&from=suggest&cat=1625&seller_type=taobao&style=list&bcoffset=0&s={}'
        # Each page shows 60 results; s is the result offset
        for i in range(1, 11):
            s = (i - 1) * 60
            url = base_url.format(s)
            yield scrapy.Request(url, callback=self.parse_detail)

    def parse_detail(self, response):
        # The listing data is embedded as JSON in the page source,
        # so pull the fields out with regular expressions
        html = response.text
        titles = re.findall('"raw_title":"(.*?)"', html)
        prices = re.findall('"view_price":"(.*?)"', html)
        sales = re.findall('"view_sales":"(.*?)"', html)
        img_urls = re.findall('"pic_url":"(.*?)"', html)
        for i in range(0, len(titles)):
            item = TaobaoItem()
            item['title'] = titles[i]
            item['price'] = prices[i]
            item['sales'] = sales[i]
            item['img_url'] = img_urls[i]
            yield item
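Indexing the four regex result lists by position assumes they all have the same length; if one pattern misses a record the loop either raises IndexError or mis-aligns fields. Iterating with zip() is a slightly safer variant (a sketch of the same loop):

        # Sketch: pair the fields with zip so a shorter list just truncates the loop
        for title, price, sale, img_url in zip(titles, prices, sales, img_urls):
            item = TaobaoItem()
            item['title'] = title
            item['price'] = price
            item['sales'] = sale
            item['img_url'] = img_url
            yield item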
import scrapy
import json
from scrapy_project.items import TuchongItem

class TuchongSpider(scrapy.Spider):
    name = 'tuchong'
    allowed_domains = ['tuchong.com']
    start_urls = ['https://tuchong.com/rest/tags/%E7%BE%8E%E5%A5%B3/posts?page=1&count=20&order=weekly']

    def parse(self, response):
        # The tag endpoint returns JSON; each post carries the URL of its detail page
        res_dict = json.loads(response.text)
        for res in res_dict['postList']:
            url = res['url']
            # title = res['title']
            # excerpt = res['excerpt']
            yield scrapy.Request(url, callback=self.get_detail)

    def get_detail(self, response):
        # with open('tuchong.html', 'wb') as f:
        #     f.write(response.body)
        srcs = response.xpath('//article[@class="post-content"]/img/@src').extract()
        for i in range(0, len(srcs)):
            item = TuchongItem()
            item['src'] = srcs[i]
            yield item
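TuchongItem is imported but not shown in these notes. Based on the single key the spider sets, it presumably looks like this in scrapy_project/items.py (a sketch):

import scrapy

class TuchongItem(scrapy.Item):
    src = scrapy.Field()  # image URL consumed by ImgsrcPipeline above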