Analyzer包含两个核心组件:Tokenizer以及TokenFilter。它们各司其职,共同完成对数据的分析。Tokenizer是字符级别的处理流;TokenFilter在词语级别进行处理,它更像是一个分析处理过滤器。我们先分析Tokenizer。少啰嗦,先看类图:
这是一个比较难分析的模块,需要好好仔细研究。AttributeSource中也包含了很多内容,但总体来说它是一个AttributeImpl的管理类:如果某个attribute的实例(即AttributeImpl)已经创建,则直接使用,否则就创建一个。那么attribute是什么呢?我们在做全文检索的时候,需要知道词的信息,例如:词的位置、词的type、词的偏移等等,这些就是attribute。待我们分析完AttributeSource,再来具体分析这些attribute。
在AttributeSource中有一个内部类AttributeFactory,它根据Attribute接口生成对应的AttributeImpl实例。
/**
 * Factory that produces an {@link AttributeImpl} instance for a given
 * {@link Attribute} interface class.
 */
public static abstract class AttributeFactory {

  /**
   * Creates an {@link AttributeImpl} for the supplied {@link Attribute} interface class.
   *
   * @param attClass the attribute interface to create an implementation for
   * @return a newly created implementation instance
   */
  public abstract AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass);

  /** Shared instance of the default factory defined below. */
  public static final AttributeFactory DEFAULT_ATTRIBUTE_FACTORY = new DefaultAttributeFactory();

  /**
   * Default factory: resolves the implementation class by appending {@code "Impl"}
   * to the interface's fully qualified name and instantiates it reflectively.
   */
  private static final class DefaultAttributeFactory extends AttributeFactory {

    // Cache from Attribute interface class to (a weak reference to) its implementation class.
    // Keys are compared by identity and held weakly by WeakIdentityMap.
    private static final WeakIdentityMap<Class<? extends Attribute>, WeakReference<Class<? extends AttributeImpl>>> attClassImplMap =
        WeakIdentityMap.newConcurrentHashMap(false);

    DefaultAttributeFactory() {}

    /**
     * Instantiates the implementation class resolved for {@code attClass}.
     *
     * @param attClass the attribute interface to create an implementation for
     * @return a new instance of the resolved implementation class
     * @throws IllegalArgumentException if the implementation class cannot be found
     *         or cannot be instantiated; the underlying exception is kept as the cause
     */
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      try {
        // This is where the AttributeImpl is actually created.
        return getClassForInterface(attClass).newInstance();
      } catch (InstantiationException e) {
        // Keep the cause so the real reason for the failure is not lost.
        throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName(), e);
      } catch (IllegalAccessException e) {
        throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName(), e);
      }
    }

    /**
     * Returns the implementation class for {@code attClass}, consulting the cache first.
     *
     * @param attClass the attribute interface
     * @return the class named {@code attClass.getName() + "Impl"}
     * @throws IllegalArgumentException if no such class can be loaded
     */
    private static Class<? extends AttributeImpl> getClassForInterface(Class<? extends Attribute> attClass) {
      // Look up the cached implementation class; the weak reference may have been cleared.
      final WeakReference<Class<? extends AttributeImpl>> ref = attClassImplMap.get(attClass);
      Class<? extends AttributeImpl> clazz = (ref == null) ? null : ref.get();
      if (clazz == null) {
        // we have the slight chance that another thread may do the same, but who cares?
        try {
          // Load "<interface name>Impl" through the interface's class loader. The boolean
          // argument asks Class.forName to initialize the class (run its static
          // initializers); it does NOT invoke any constructor.
          clazz = Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader())
              .asSubclass(AttributeImpl.class);
          attClassImplMap.put(attClass, new WeakReference<Class<? extends AttributeImpl>>(clazz));
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Could not find implementing class for " + attClass.getName(), e);
        }
      }
      return clazz;
    }
  }
}
其中用到了一个比较重要的工具类:WeakIdentityMap(注意它是一个独立的public final类,并不是内部类)。关于弱引用的背景知识,看完下面这篇文章基本就能理解:https://www.cnblogs.com/huajiezh/p/5835618.html
public final class WeakIdentityMap<K,V> {
private final ReferenceQueue<Object> queue = new ReferenceQueue<Object>();
private final Map<IdentityWeakReference, V> backingStore;
private final boolean reapOnRead;
public static <K,V> WeakIdentityMap<K,V> newHashMap() {
return newHashMap(true);
}
public static <K,V> WeakIdentityMap<K,V> newHashMap(boolean reapOnRead) {
return new WeakIdentityMap<K,V>(new HashMap<IdentityWeakReference,V>(), reapOnRead);
}
public static <K,V> WeakIdentityMap<K,V> newConcurrentHashMap() {
return newConcurrentHashMap(true);
}
//线程安全的HashMap
public static <K,V> WeakIdentityMap<K,V> newConcurrentHashMap(boolean reapOnRead) {
return new WeakIdentityMap<K,V>(new ConcurrentHashMap<IdentityWeakReference,V>(), reapOnRead);
}
private WeakIdentityMap(Map<IdentityWeakReference, V> backingStore, boolean reapOnRead) {
this.backingStore = backingStore;
this.reapOnRead = reapOnRead;
}
public void clear() {
backingStore.clear();
reap();
}
public boolean containsKey(Object key) {
if (reapOnRead) reap();
return backingStore.containsKey(new IdentityWeakReference(key, null));
}
public V get(Object key) {
if (reapOnRead) reap();
return backingStore.get(new IdentityWeakReference(key, null));
}
public V put(K key, V value) {
reap();
return backingStore.put(new IdentityWeakReference(key, queue), value);
}
public boolean isEmpty() {
return size() == 0;
}
public V remove(Object key) {
reap();
return backingStore.remove(new IdentityWeakReference(key, null));
}
public int size() {
if (backingStore.isEmpty())
return 0;
if (reapOnRead) reap();
return backingStore.size();
}
public Iterator<K> keyIterator() {
reap();
final Iterator<IdentityWeakReference> iterator = backingStore.keySet().iterator();
// IMPORTANT: Don't use oal.util.FilterIterator here:
// We need *strong* reference to current key after setNext()!!!
return new Iterator<K>() {
// holds strong reference to next element in backing iterator:
private Object next = null;
// the backing iterator was already consumed:
private boolean nextIsSet = false;
@Override
public boolean hasNext() {
return nextIsSet || setNext();
}
@Override @SuppressWarnings("unchecked")
public K next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
assert nextIsSet;
try {
return (K) next;
} finally {
// release strong reference and invalidate current value:
nextIsSet = false;
next = null;
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
private boolean setNext() {
assert !nextIsSet;
while (iterator.hasNext()) {
next = iterator.next().get();
if (next == null) {
// the key was already GCed, we can remove it from backing map:
iterator.remove();
} else {
// unfold "null" special value:
if (next == NULL) {
next = null;
}
return nextIsSet = true;
}
}
return false;
}
};
}
public Iterator<V> valueIterator() {
if (reapOnRead) reap();
return backingStore.values().iterator();
}
public void reap() {
Reference<?> zombie;
while ((zombie = queue.poll()) != null) {
backingStore.remove(zombie);
}
}
// we keep a hard reference to our NULL key, so map supports null keys that never get GCed:
static final Object NULL = new Object();
private static final class IdentityWeakReference extends WeakReference<Object> {
private final int hash;
IdentityWeakReference(Object obj, ReferenceQueue<Object> queue) {
super(obj == null ? NULL : obj, queue);
hash = System.identityHashCode(obj);
}
@Override
public int hashCode() {
return hash;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o instanceof IdentityWeakReference) {
final IdentityWeakReference ref = (IdentityWeakReference)o;
if (this.get() == ref.get()) {
return true;
}
}
return false;
}
}
}
接下来是三个比较重要的变量:
// Maps an Attribute interface class to the AttributeImpl instance registered for it.
private final Map<Class<? extends Attribute>, AttributeImpl> attributes;
// Maps a concrete AttributeImpl class to its registered instance.
private final Map<Class<? extends AttributeImpl>, AttributeImpl> attributeImpls;
// Only slot 0 is used in the code shown here: it caches the head of the linked list of State
// nodes. addAttributeImpl() resets it to null, and getCurrentState() rebuilds it lazily.
private final State[] currentState;
上述前两个成员(attributes和attributeImpls)保存了两种映射关系:第一个映射是Attribute接口到AttributeImpl实例的映射,即某个AttributeImpl实例所实现的所有Attribute接口,都映射到该AttributeImpl实例;第二个映射是AttributeImpl的具体实现类到该AttributeImpl实例的映射。
设计这两个映射关系的目的是在该AttributeSource实例中对每个Attribute和AttributeImpl保证只有一个AttributeImpl实例,换句话说,当用具体Attribute或者具体AttributeImpl获取其对象实例时,不会每次都新建实例,而是首次时建立,其后只返回以前建立的。其维护上述两种关系的代码如下:
一般的调用入口是addAttribute,它在需要时会调用addAttributeImpl,而addAttributeImpl又会调用getAttributeInterfaces。下面按代码中出现的顺序依次给出:
// Static cache shared by all instances: maps an AttributeImpl class to the list of
// (weakly referenced) Attribute interfaces found for it by getAttributeInterfaces().
// Created with reapOnRead == false.
private static final WeakIdentityMap<Class<? extends AttributeImpl>,LinkedList<WeakReference<Class<? extends Attribute>>>> knownImplClasses =
WeakIdentityMap.newConcurrentHashMap(false);
/*
 * Returns the Attribute sub-interfaces implemented by the given AttributeImpl class,
 * looking first in the global implementation-class -> interfaces cache.
 * The result is a LinkedList because one implementation may implement several interfaces.
 * On a cache miss the class and all of its superclasses are scanned, and the result is cached.
 */
static LinkedList<WeakReference<Class<? extends Attribute>>> getAttributeInterfaces(final Class<? extends AttributeImpl> clazz) {
  final LinkedList<WeakReference<Class<? extends Attribute>>> cached = knownImplClasses.get(clazz);
  if (cached != null) {
    return cached;
  }
  // we have the slight chance that another thread may do the same, but who cares?
  final LinkedList<WeakReference<Class<? extends Attribute>>> discovered =
      new LinkedList<WeakReference<Class<? extends Attribute>>>();
  // Walk up the class hierarchy and collect every directly declared interface
  // that extends Attribute (Attribute itself is skipped).
  Class<?> current = clazz;
  do {
    for (Class<?> candidate : current.getInterfaces()) {
      final boolean extendsAttribute =
          candidate != Attribute.class && Attribute.class.isAssignableFrom(candidate);
      if (extendsAttribute) {
        final Class<? extends Attribute> attributeInterface = candidate.asSubclass(Attribute.class);
        discovered.add(new WeakReference<Class<? extends Attribute>>(attributeInterface));
      }
    }
    current = current.getSuperclass();
  } while (current != null);
  knownImplClasses.put(clazz, discovered);
  return discovered;
}
/*
 * Registers the given AttributeImpl instance.
 * If attributeImpls already contains the instance's concrete class, nothing happens.
 * Otherwise getAttributeInterfaces() supplies every Attribute interface the class implements,
 * and for each interface not yet present in attributes:
 *   - the cached state is invalidated,
 *   - the interface is mapped to this instance in attributes,
 *   - the concrete class is mapped to this instance in attributeImpls.
 */
public final void addAttributeImpl(final AttributeImpl att) {
  final Class<? extends AttributeImpl> implClass = att.getClass();
  if (attributeImpls.containsKey(implClass)) {
    return;
  }
  // add all interfaces of this AttributeImpl to the maps
  for (WeakReference<Class<? extends Attribute>> interfaceRef : getAttributeInterfaces(implClass)) {
    final Class<? extends Attribute> attributeInterface = interfaceRef.get();
    assert (attributeInterface != null) :
        "We have a strong reference on the class holding the interfaces, so they should never get evicted";
    // Attribute is a superclass of this interface
    if (attributes.containsKey(attributeInterface)) {
      continue;
    }
    // invalidate state to force recomputation in captureState()
    this.currentState[0] = null;
    attributes.put(attributeInterface, att);
    attributeImpls.put(implClass, att);
  }
}
/*
 * Entry point for obtaining an attribute.
 * Looks up attClass in the attributes map (Attribute interface -> AttributeImpl instance).
 * If an instance is already registered, it is cast to attClass and returned.
 * Otherwise attClass is validated (it must be an interface extending Attribute), the factory
 * creates the matching AttributeImpl, and addAttributeImpl() registers it before it is returned.
 */
public final <T extends Attribute> T addAttribute(Class<T> attClass) {
  final AttributeImpl existing = attributes.get(attClass);
  if (existing != null) {
    return attClass.cast(existing);
  }
  if (!attClass.isInterface() || !Attribute.class.isAssignableFrom(attClass)) {
    throw new IllegalArgumentException(
        "addAttribute() only accepts an interface that extends Attribute, but " +
        attClass.getName() + " does not fulfil this contract."
    );
  }
  final AttributeImpl created = this.factory.createAttributeInstance(attClass);
  addAttributeImpl(created);
  return attClass.cast(created);
}
程序中缓存了如此多的AttributeImpl,怎么快速遍历它们呢?这就需要用到下边的类了。
/**
 * Node of a singly linked list of {@link AttributeImpl} instances.
 */
public static final class State implements Cloneable {
  AttributeImpl attribute;
  State next;

  /**
   * Returns a copy of the list starting at this node: every node is duplicated
   * and every attribute is cloned, in order from this node to the end of the list.
   */
  @Override
  public State clone() {
    final State head = new State();
    head.attribute = attribute.clone();
    State copyTail = head;
    for (State source = next; source != null; source = source.next) {
      final State copy = new State();
      copy.attribute = source.attribute.clone();
      copyTail.next = copy;
      copyTail = copy;
    }
    return head;
  }
}
在上边提到的比较关键的三个变量之中,还有一个currentState没有介绍。而且在之前添加attribute与AttributeImpl等映射关系的时候,它并没有被初始化,而是被置空:this.currentState[0] = null;
它真正的初始化是在下面的方法中:
/**
 * Returns a clone of the current state list, or {@code null} when
 * {@code getCurrentState()} returns {@code null}.
 */
public final State captureState() {
  final State current = this.getCurrentState();
  if (current == null) {
    return null;
  }
  return current.clone();
}
咦,怎么没有看到初始化呢?别着急,请看this.getCurrentState():
/**
 * Returns the head of the cached state list, building it on first use.
 * Returns {@code null} when nothing is cached and {@code hasAttributes()} is false.
 */
private State getCurrentState() {
  final State cached = currentState[0];
  if (cached != null) {
    return cached;
  }
  if (!hasAttributes()) {
    return null;
  }
  // Publish the new head in the cache slot first, then fill in the list behind it.
  // "head" is what gets returned; "tail" is the cursor used while appending nodes.
  final State head = new State();
  currentState[0] = head;
  State tail = head;
  final Iterator<AttributeImpl> impls = attributeImpls.values().iterator();
  // A fresh iterator is positioned before the first element, so next() is called once
  // up front to fill the head node.
  tail.attribute = impls.next();
  while (impls.hasNext()) {
    tail.next = new State();
    tail = tail.next;
    tail.attribute = impls.next();
  }
  return head;
}
初始化完了以后,我们需要遍历,那遍历来了:
/**
 * Returns a read-only iterator over the {@link AttributeImpl} instances held in the
 * current state list; an empty iterator is returned when there is no state.
 */
public final Iterator<AttributeImpl> getAttributeImplsIterator() {
  final State head = getCurrentState();
  if (head == null) {
    return Collections.<AttributeImpl>emptySet().iterator();
  }
  return new Iterator<AttributeImpl>() {
    // Node whose attribute the next call to next() will return.
    private State cursor = head;

    @Override
    public boolean hasNext() {
      return cursor != null;
    }

    /*
     * The emptiness check comes first: a null cursor means the list is exhausted and
     * an exception must be thrown. Otherwise the current node is remembered, the cursor
     * moves on, and the remembered node's attribute is returned.
     */
    @Override
    public AttributeImpl next() {
      final State current = cursor;
      if (current == null) {
        throw new NoSuchElementException();
      }
      cursor = current.next;
      return current.attribute;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  };
}
最后还有一个方法值得介绍一下:reflectAsString。它以String的形式返回当前所有attribute的值。
/**
 * Builds a comma-separated string of {@code key=value} pairs from everything
 * reported to the reflector passed to {@code reflectWith}.
 *
 * @param prependAttClass when true, each pair is prefixed with the attribute
 *        interface's class name followed by {@code '#'}
 * @return the assembled string (empty when nothing was reported)
 */
public final String reflectAsString(final boolean prependAttClass) {
  final StringBuilder out = new StringBuilder();
  final AttributeReflector collector = new AttributeReflector() {
    @Override
    public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
      // Separate this entry from the previous one.
      if (out.length() > 0) {
        out.append(',');
      }
      // Optionally insert the attribute interface name.
      if (prependAttClass) {
        out.append(attClass.getName());
        out.append('#');
      }
      // Insert the value; a null value is rendered as "null".
      out.append(key);
      out.append('=');
      out.append(String.valueOf(value));
    }
  };
  reflectWith(collector);
  return out.toString();
}
看到这里其实还是一头雾水吧?这个String到底是怎么拼出来的?回调方法reflect的形参里是需要有key和value的,那它们是从哪里传进来的呢?
接着看:
/**
 * Passes the reflector to every attribute in the current state list, in list order.
 *
 * @param reflector the callback handed to each attribute's own {@code reflectWith}
 */
public final void reflectWith(AttributeReflector reflector) {
  State node = getCurrentState();
  while (node != null) {
    node.attribute.reflectWith(reflector);
    node = node.next;
  }
}
最终我们看到了:其实最终调用的,是各个AttributeImpl自己的reflectWith。
到这里,这个特别繁琐的类已经告一段落了。我们已经知道了这个类是用来管理attribute的,但是attribute到底是什么呢?它又实现了什么功能呢?
请期待下一篇博客...........