无数据库存储
JSON
python利用json模块对json进行编码和解码
编码使用函数dump和dumps
dump将python对象编码为json对象并存入fp指定文件;dumps将python对象生成字符串。
dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8', default=None, sort_keys=False,**kw)
dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8', default=None, sort_keys=False,**kw)
常用参数:
skipkeys:若字典内的key不是Python的基本类型(str、int、float、bool、None),设为False时会抛出TypeError;设为True则会跳过这类key
ensure_ascii:为True,若字典内含有非ASCII字符会以“\uXXXX”格式显示,设置为False,就可正常显示
indent: 应为非负整型,若为0或None则显示一行数据,否则会换行并按照indent值显示空白,格式化显示
separators:分隔符,元组(item_separator, key_separator);indent为None时默认为(', ', ': '),指定indent后默认为(',', ': ')
encoding: 设置JSON数据编码方式
sort_keys:根据keys值排序
解码使用函数load和loads
loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None,**kw)
load(fp, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None,**kw)
encoding:指定编码格式
parse_float:如果指定,将对每个JSON浮点数字符串调用该函数进行解码,相当于float(num_str)
对象转化规则
CSV
字符分隔值CSV 纯文本形式存储数据;使用python的CSV库
csv 文件编码针对列表使用writer,字典使用DictWriter对象
import csv

# Example: writing list/tuple rows with csv.writer.
headers = ['id', 'user', 'pas']
rows = [
    (1001, "lll", "123"),
    (1002, "rrr", "123"),
    (1003, "zzz", "123"),
]
# newline='' prevents blank lines between rows on Windows;
# an explicit encoding keeps the output portable.
with open('file.csv', 'w', newline='', encoding='utf-8') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)   # fixed: was misspelled "writerrow" (AttributeError)
    f_csv.writerows(rows)
# Dictionary rows: csv.DictWriter maps each dict's keys onto the listed
# field names; writeheader() emits the header row before the data.
# NOTE(review): this is a fragment lifted from a class method — `file` and
# `self.datas` are defined by the surrounding (unseen) context.
with open(file,'a',newline='') as f:
writer = csv.DictWriter(f,['title','summary','pic_link'])
writer.writeheader()
writer.writerows(self.datas)
csv读取需要reader对象
from collections import namedtuple

# Variant 1: plain reader — each row is a list of strings.
with open('file.csv', 'r', newline='', encoding='utf-8') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)      # first line is the header row
    print(headers)
    for row in f_csv:
        print(row)             # e.g. ['1001', 'lll', '123']
        print(row[0])

# Variant 2: named access — wrap each row in a namedtuple built from the
# header. (The file is reopened: the previous loop exhausted the reader,
# and the original's `heading` was an undefined name — should be `headings`.)
with open('file.csv', 'r', newline='', encoding='utf-8') as f:
    f_csv = csv.reader(f)
    headings = next(f_csv)
    Row = namedtuple('Row', headings)
    for r in f_csv:
        row = Row(*r)
        print(row.user, row.pas)

# Variant 3: dict access — .get() works on csv.DictReader rows
# (namedtuple rows have no .get(); the original mixed the two APIs).
with open('file.csv', 'r', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row.get('user'))
多媒体文件抽取
urllib模块提供了函数urlretrieve(),将远程数据下载到本地
urlretrieve(url, filename=None, reporthook=None, data=None)
url:下载资源链接
filename:存储路径
reporthook:回调函数,1.连接服务器成功2.下载完一个数据块时会触发一次该函数,可以用来表示下载进度
data: 指post到服务器的数据。urlretrieve返回一个包含两个元素的元组(filename, headers),分别为本地存储路径和服务器响应头。
Email提醒
常使用email对异常或成功事件提醒。
###爬虫基本构成与实践
#coding:utf-8
import csv
import json
import re
import smtplib
import urllib
import urllib.parse
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr

import requests
from bs4 import BeautifulSoup as BS
class UrlManager(object):
    """Tracks URLs waiting to be crawled (new) and already crawled (old)."""

    def __init__(self):
        self.new_urls = set()   # URLs not yet downloaded
        self.old_urls = set()   # URLs already handed out for download

    def has_new_urls(self):
        """Return True while there are URLs left to crawl."""
        return self.new_url_size() != 0

    def get_new_url(self):
        """Pop one pending URL, record it as crawled, and return it.

        Returns None when the pending set is empty. (The original returned
        the undefined name NULL, which raised NameError at runtime.)
        """
        if self.has_new_urls():
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        return None

    def add_new_url(self, new_url):
        """Queue `new_url` unless it is None or was already crawled.

        Returns True when queued, False otherwise (the original fell
        through and returned None for already-crawled URLs).
        """
        if new_url is None:
            return False
        if new_url in self.old_urls:
            # Already crawled — no need to re-queue; membership in
            # new_urls needs no check because set.add deduplicates.
            return False
        self.new_urls.add(new_url)
        return True

    def add_new_urls(self, new_urls):
        """Queue every URL in the iterable `new_urls`."""
        for url in new_urls:
            self.add_new_url(url)

    def new_url_size(self):
        """Number of URLs still pending."""
        return len(self.new_urls)
class HtmlDownloader(object):
    """Fetches a page over HTTP and returns its body text."""

    def download(self, url):
        """Return the UTF-8 decoded body of `url`, or None when `url`
        is missing or the server does not answer with HTTP 200."""
        if url is None:
            return None
        headers = {
            'User-Agent': 'Mozilla/4.0(compatible; Chrome 80.0.3987.116; Windows 10)',
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        # Force UTF-8 decoding before reading .text.
        response.encoding = 'utf-8'
        return response.text
class HtmlParser(object):
    """Extracts follow-up links and record data from a parsed baike page."""

    def paser(self, page_url, soup):
        """Return (new_urls, new_data) extracted from `soup`, or None when
        either input is missing.

        NOTE: the method name keeps the original spelling ("paser")
        because existing callers use it.
        """
        if page_url is None or soup is None:
            return None
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    def get_new_urls(self, page_url, soup):
        """Collect absolute '/item/...' links from every matching <a> tag."""
        urls = []
        links = soup.find_all('a', href=re.compile(r'/item/.+/\d+'))
        for link in links:
            # Relative hrefs are resolved against the page they came from.
            urls.append(urllib.parse.urljoin(page_url, link['href']))
        return urls

    def get_new_data(self, page_url, soup):
        """Extract title, summary and first image URL from a lemma page.

        Missing elements yield None fields instead of raising
        AttributeError, so unexpected page layouts no longer abort the crawl.
        """
        title = None
        title_tag = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        if title_tag is not None:
            h1 = title_tag.find('h1')
            if h1 is not None:
                title = h1.string
        summary = None
        summary_tag = soup.find('div', class_="lemma-summary")
        if summary_tag is not None:
            # fixed: the original pattern was r'\D', which deleted every
            # non-digit character and reduced the summary to bare digits;
            # the intent was to strip whitespace/newlines from get_text().
            summary = re.sub(r'\s', '', summary_tag.get_text())
        img = soup.find('img')
        pic_link = img.get('src') if img is not None else None
        return {'title': title, 'summary': summary, 'pic_link': pic_link}

    def schedule(self, blocknum, blocksize, totalsize):
        """urlretrieve reporthook: print a simple download progress bar."""
        per = 100.0 * blocknum * blocksize / totalsize
        if per > 100.0:
            per = 100.0
        print('当前下载进度:{}>{:.3f}%'.format('-' * int(per / 10), per), end='\r')
class DataOutput(object):
    """Buffers parsed records and flushes them to a CSV file in batches."""

    FIELDS = ['title', 'summary', 'pic_link']

    def __init__(self):
        self.datas = []   # pending records not yet written to disk

    def store_data(self, data, file, num):
        """Buffer one record; flush to `file` every 10 records, or when
        `num` (links remaining) is 0 so the final partial batch is kept."""
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) == 10 or num == 0:
            self.output(file)
            self.datas = []

    def output(self, file):
        """Append buffered records to `file` as CSV rows.

        The header is emitted only when the file is empty — the original
        called writeheader() on every flush of an append-mode file,
        interleaving header lines with the data rows.
        """
        with open(file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, self.FIELDS)
            if f.tell() == 0:   # append-mode position 0 means a fresh file
                writer.writeheader()
            writer.writerows(self.datas)
class SpiderControl(object):
def __init__(self):
self.urlmanager = UrlManager()
self.hpaser = HtmlParser()
self.hdownloader = HtmlDownloader()
self.output = DataOutput()
def crawl(self, url, num):
link_num=0
self.urlmanager.add_new_url(url)
while(self.urlmanager.has_new_urls() and link_num < num):
using_url = self.urlmanager.get_new_url()
link_num+=1
html_text = self.hdownloader.download(using_url)
soup = BS(html_text,from_encoding='utf-8')
n_urls, n_data = self.hpaser.paser(using_url, soup)
self.urlmanager.add_new_urls(n_urls)
self.output.store_data(n_data,'store.csv',num-link_num)
#print()
print("已抓取{}个链接".format(link_num),end='\r')
def _format_addr(self,s):
name, addr = parseaddr(s)
retuen formataddr((Header(name,'utf-8').encode(),addr))
def transemail(self):
from_addr = 'xxxxx@163.com' #注册的开启smtp功能的邮箱
password = 'xxxxxxx'
to_addr = 'xxxx@qq.com'
smtp_server = 'smtp.163.com'
msg = MIMEText("抓取完成",'plain','utf-8') #纯文本文件
msg['From'] = _format_addr('爬虫<%s>'% from_addr)
msg['To'] = _format_addr('管理<%s>'% to_addr)
msg['Subject'] = Header('状态','utf-8').encode()
server = smtplib.SMTP(smtp_server,25)
server.login(from_addr,password)
server.sendmail(from_addr,[to_addr],msg.as_string())
server.quit()
if __name__ == '__main__':
    # Entry point: crawl up to 104 pages starting from the seed article.
    # The guard prevents an import of this module from starting a crawl.
    sc = SpiderControl()
    sc.crawl("https://baike.baidu.com/item/美国/125486", 104)