python基于scrapy爬取网页信息,并保存数据。
pycharm,vs code , sublime3 都可以写。
1。打开终端。安装模块(wheel , scrapy ,pymysql。。。等等)
如果有人安装失败,可以试试pip install Scrapy(将scrapy首字母为大写,仅供参考)
2. 安装好后,在cmd中输入:scrapy startproject xxx(项目名,自己定义)创建项目。。。等。
3. 这里用Tencent招聘做例子。
代码部分
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem
from urllib import request
import logging
import time
logging.basicConfig(
level=logging.DEBUG,
format=’%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s’,
datefmt=’%a,%d %b %Y %H:%M:%S’,
filename=‘bole.log’,
filemode=‘w’
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter(’%(name)-12s: %(levelname)-8s %(message)s’)
console.setFormatter(formatter)
logging.getLogger(’’).addHandler(console)
class TencentspiderSpider(scrapy.Spider):
    """Scrape Tencent HR job listings and follow each posting's detail page.

    Yields TencentItem objects carrying listing fields (date, location, type,
    name, url) plus detail-page fields (duty, rq).
    """
    name = 'tencentspider'
    allowed_domains = ['hr.tencent.com']
    # The site paginates with ?start=0,10,20,... — build the first 5 pages.
    # BUGFIX: the original `base_url%(i-1)*10` parsed as
    # (base_url % (i-1)) * 10, repeating the URL string ten times instead of
    # computing the page offset.
    start_urls = []
    base_url = 'https://hr.tencent.com/position.php?&start=%s#a'
    for i in range(0, 5):
        start_urls.append(base_url % (i * 10))

    def parse(self, response):
        """Parse one listing page and request the detail page of every job."""
        job_even = response.xpath('//tr[@class="even"]')
        job_odd = response.xpath('//tr[@class="odd"]')
        # Even and odd table rows together cover all job rows.
        for job in job_even + job_odd:
            item = TencentItem()
            item['date'] = job.xpath('.//td[5]/text()').extract()      # posting date
            item['location'] = job.xpath('.//td[4]/text()').extract()  # work location
            item['type'] = job.xpath('.//td[2]/text()').extract()      # job category
            item['name'] = job.xpath('.//td[1]/a/text()').extract()    # job title
            # The href is relative; response.urljoin resolves it against the
            # page URL (the original went through urllib.request.urljoin,
            # a non-public re-export of urllib.parse.urljoin).
            url = job.xpath('.//td[1]/a/@href').extract()[0]
            url = response.urljoin(url)
            item['url'] = url
            # Carry the partially-filled item to the detail-page callback.
            yield scrapy.Request(url=url, callback=self.parse_detail,
                                 meta={'data': item})

    def parse_detail(self, response):
        """Fill in duty/requirements from a job detail page and yield the item."""
        item = response.meta['data']
        # First tr.c block holds the responsibilities, second the requirements;
        # each is a list of <li> text fragments joined into one string.
        item['duty'] = ''.join(
            response.xpath('//tr[@class="c"][1]//li/text()').extract())
        item['rq'] = ''.join(
            response.xpath('//tr[@class="c"][2]//li/text()').extract())
        yield item
**settings、items、pipelines 这几个文件都需要相应地编写和配置。**
将数据保存为文本格式,代码如下:
# -*- coding: utf-8 -*-
import os
import time
class TencentPipeline(object):
    """Append scraped Tencent job items to a dated text file under ./output."""

    # BUGFIX: this was `def init(self)`, which Python never calls, so
    # process_item crashed on the missing self.foldername/self.filename/self.num.
    def __init__(self):
        self.num = 0  # running counter, written as a simple record id
        self.foldername = "output"
        if not os.path.exists(self.foldername):
            os.mkdir(self.foldername)
        # BUGFIX: was '%Y-%d-%m' (year-DAY-month); use conventional ISO order.
        current_time = time.strftime('%Y-%m-%d', time.localtime())
        self.filename = "tencent" + current_time + ".txt"

    def process_item(self, item, spider):
        """Append one item to the output file and return it unchanged.

        Listing fields (type/name/location/date) are extract() lists, so the
        first element is written; url/duty/rq are already plain strings.
        """
        print("写入(成功The success of writen those datas)")
        # 'a+' appends, so repeated runs on the same day accumulate records.
        with open(self.foldername + "/" + self.filename, "a+",
                  encoding="utf-8") as fp:
            self.num += 1
            fp.write("id>" + str(self.num) + "\n")
            fp.write("职位类别>" + item['type'][0] + "\n")
            fp.write("职位名称>" + item['name'][0] + "\n")
            fp.write("地点>" + item['location'][0] + "\n")
            fp.write("时间>" + item['date'][0] + "\n")
            fp.write("链接>" + item['url'] + "\n\n")
            fp.write("-----------详情页------------" + "\n")
            fp.write("工作职责>" + item['duty'] + "\n")
            fp.write("工作要求>" + item['rq'] + "\n")
        return item
最后在cmd中运行,查看结果。
同时也可以用其他模块,比如pyquery,与jQuery方法相似。代码不喜勿喷,请大家原谅。