浏览器工作原理,动手码一个简易浏览器(1):HTML解析

1. 本节代码

本节代码,git仓库地址

2. 状态机(有限状态机)

2.1 概念知识

  1. 每一个状态都是一个机器
    • 在每个机器里,我们可以做计算、存储、输出…
    • 所有的这些机器接受的输入是一致的
    • 状态机的每一个机器本身没有状态,如果我们用函数来表示的话,他应该是纯函数(无副作用,函数内部不再受函数外部的输入控制)
  2. 每一个机器知道下一个状态
    • 每个机器都有确定的下一个状态(Moore)
    • 每个机器根据 输入决定下一个状态(Mealy)
	// Each function represents one state
	function state(input){ // the function argument is the input
		// TODO: the logic for this state can be written freely inside the function
		
		return next; // the return value is used as the next state
	}
	/*********** Invocation ************/
	while(input) {
		// fetch the input
		state = state(input); // the value returned by the state function becomes the next state
	}

2.2 使用状态机处理字符

1). 使用状态机查询字符串中是否包含"abc"

	/**
	 * Tells whether `src` contains the substring "abc".
	 * Each character is handed to the current state function, and whatever
	 * that function returns is used as the state for the next character.
	 */
	function match(src) {
	    const finalState = [...src].reduce((state, char) => state(char), start);
	    // A match happened only if the machine stopped in the terminal state.
	    return finalState === end;
	}
	
	/** Initial state: nothing matched yet, waiting for 'a'. */
	function start(char) {
	    return char === 'a' ? foundA : start;
	}
	
	/** "a" matched, waiting for 'b'. */
	function foundA(char) {
	    if(char !== 'b') {
	        // Re-dispatch the same character to start (a call, not a plain
	        // return) so that an 'a' arriving here can begin a new match.
	        return start(char);
	    }
	    return foundB;
	}
	
	/** "ab" matched, waiting for 'c'. */
	function foundB(char) {
	    if(char !== 'c') {
	        return start(char);
	    }
	    return end;
	}
	
	/** Terminal (trap) state: once reached, every input keeps the machine here. */
	function end(c) {
	    return end;
	}

2). 使用状态机查询字符串中是否包含"abcabx"

	/**
	 * Tells whether `src` contains the substring "abcabx".
	 * Each character is handed to the current state function, and whatever
	 * that function returns is used as the state for the next character.
	 */
	function match(src) {
	    const finalState = [...src].reduce((state, char) => state(char), start);
	    return finalState === end;
	}
	
	/** Initial state: nothing matched yet, waiting for 'a'. */
	function start(char) {
	    return char === 'a' ? foundA : start;
	}
	
	/** "a" matched, waiting for 'b'. */
	function foundA(char) {
	    if(char !== 'b') {
	        return start(char);
	    }
	    return foundB;
	}
	
	/** "ab" matched, waiting for 'c'. */
	function foundB(char) {
	    if(char !== 'c') {
	        return start(char);
	    }
	    return foundC;
	}
	
	/** "abc" matched, waiting for the second 'a'. */
	function foundC(char) {
	    if(char !== 'a') {
	        return start(char);
	    }
	    return foundA2;
	}
	
	/** "abca" matched, waiting for the second 'b'. */
	function foundA2(char) {
	    if(char !== 'b') {
	        return start(char);
	    }
	    return foundB2;
	}

	/** "abcab" matched, waiting for 'x'. */
	function foundB2(char) {
	    if(char !== 'x') {
	        // The trailing "ab" of "abcab" is also a prefix of the pattern,
	        // so the character is re-dispatched to foundB instead of start.
	        return foundB(char);
	    }
	    return end;
	}
	
	/** Terminal (trap) state: once reached, every input keeps the machine here. */
	function end(c) {
	    return end;
	}

3). 使用状态机查询字符串中是否包含"abababx"

	/**
	 * Tells whether `src` contains the substring "abababx".
	 * Each character is handed to the current state function, and whatever
	 * that function returns is used as the state for the next character.
	 */
	function match(src) {
	    const finalState = [...src].reduce((state, char) => state(char), start);
	    return finalState === end;
	}
	
	/** Initial state: nothing matched yet, waiting for 'a'. */
	function start(char) {
	    return char === 'a' ? foundA : start;
	}
	
	/** "a" matched, waiting for 'b'. */
	function foundA(char) {
	    if(char !== 'b') {
	        return start(char);
	    }
	    return foundB;
	}
	
	/** "ab" matched, waiting for the second 'a'. */
	function foundB(char) {
	    if(char !== 'a') {
	        return start(char);
	    }
	    return foundA2;
	}
	
	/** "aba" matched, waiting for the second 'b'. */
	function foundA2(char) {
	    if(char !== 'b') {
	        return start(char);
	    }
	    return foundB2;
	}
	
	/** "abab" matched, waiting for the third 'a'. */
	function foundB2(char) {
	    if(char !== 'a') {
	        // "abab" ends with "ab", a prefix of the pattern: retry from foundB.
	        return foundB(char);
	    }
	    return foundA3;
	}
	
	/** "ababa" matched, waiting for the third 'b'. */
	function foundA3(char) {
	    if(char !== 'b') {
	        return start(char);
	    }
	    return foundB3;
	}
	
	/** "ababab" matched, waiting for 'x'. */
	function foundB3(char) {
	    if(char !== 'x') {
	        // "ababab" ends with "abab", a prefix of the pattern: retry from foundB2.
	        return foundB2(char);
	    }
	    return end;
	}
	
	/** Terminal (trap) state: once reached, every input keeps the machine here. */
	function end(c) {
	    return end;
	}

2.3 字符串 KMP 算法

1). 参考链接

3. 简易浏览器实现

在这里插入图片描述

3.1 HTTP协议解析

  1. ISO-OSI七层网络模型
    在这里插入图片描述

  2. Request结构
    在这里插入图片描述

  3. Response结构
    在这里插入图片描述

  4. 创建server

const http = require('http');

/**
 * Request handler of the demo server: collects the request body, logs it,
 * and then answers every request with a fixed "Hello World" page.
 */
const handleRequest = (request, response) => {
    const chunks = [];

    request.on('error', (err) => {
        console.error(err);
    });

    request.on('data', (chunk) => {
        chunks.push(chunk);
    });

    request.on('end', () => {
        // All body chunks have arrived: join them into one string.
        const body = Buffer.concat(chunks).toString();
        console.log("body:", body);
        response.writeHead(200, {
            'Content-type': 'text/html'
        });
        response.end('Hello World\n');
    });
};

http.createServer(handleRequest).listen(8088);

console.log('server started!')
  5. 创建client
//ChunkedBodyParser.js
/**
 * Incremental parser for an HTTP body sent with `Transfer-Encoding: chunked`.
 *
 * The body is fed one character at a time through `parser(char)`. Decoded
 * chunk data is collected in `content`, and `isFinished` becomes true once
 * the terminating zero-length chunk has been seen.
 *
 * Each state is a method that consumes one character and returns the method
 * to use for the next character.
 */
class ChunkedBodyParser {

    // Bytes still to be read for the current chunk.
    length = 0;
    // Characters of the decoded body, in arrival order.
    content = [];
    // True once the zero-length terminating chunk has been seen.
    isFinished = false;
    // Current state method.
    state = this.waitingLength;

    constructor() {}

    /**
     * Number of bytes `text` occupies when encoded as UTF-8.
     *
     * Chunk sizes are expressed in bytes, while the parser receives decoded
     * characters, so every character must be weighted by its encoded size.
     * A lone surrogate half counts as 2 bytes, so a surrogate pair fed as
     * two separate UTF-16 units still adds up to the 4 bytes of its encoding.
     *
     * @param {string} text - text fragment (normally one character).
     * @returns {number} UTF-8 byte length of `text`.
     */
    static utf8ByteLength(text) {
        let bytes = 0;
        for (const symbol of text) {
            const codePoint = symbol.codePointAt(0);
            if (codePoint < 0x80) {
                bytes += 1;
            } else if (codePoint < 0x800) {
                bytes += 2;
            } else if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
                bytes += 2;
            } else if (codePoint < 0x10000) {
                bytes += 3;
            } else {
                bytes += 4;
            }
        }
        return bytes;
    }

    /**
     * Feeds one character of the raw body into the state machine.
     *
     * @param {string} char - next character of the raw chunked body.
     */
    parser(char) {
        this.state = this.state(char);
    }

    /** Reads the hexadecimal chunk size up to the '\r' that ends the size line. */
    waitingLength(char){
        if(char === '\r') {
            if(this.length === 0) {
                // A zero-length chunk marks the end of the body.
                this.isFinished = true;
                return this.end;
            }

            return this.waitingLengthEnd;
        }else {
            this.length *= 16;
            this.length += Number.parseInt(char, 16);
            return this.waitingLength;
        }
    }

    /** Waits for the '\n' that ends the chunk-size line. */
    waitingLengthEnd(char) {
        if(char === '\n') {
            return this.readingChunk;
        }else {
            return this.waitingLengthEnd;
        }
    }

    /** Collects chunk data until the announced number of bytes is consumed. */
    readingChunk(char) {
        this.content.push(char);
        // The chunk size counts bytes, not characters.
        this.length -= ChunkedBodyParser.utf8ByteLength(char);
        if(this.length <= 0) {
            // Start the next size line from zero even if the count overshot.
            this.length = 0;
            return this.waitingNewLine;
        }else {
            return this.readingChunk;
        }
    }

    /** Waits for the '\r' that follows the chunk data. */
    waitingNewLine(char){
        if(char === '\r') {
            return this.waitingNewLineEnd;
        }else {
            return this.waitingNewLine;
        }
    }

    /** Waits for the '\n' that follows the chunk data, then expects a new size line. */
    waitingNewLineEnd(char) {
        if(char === '\n') {
            return this.waitingLength;
        }else {
            return this.waitingNewLineEnd;
        }
    }

    /** Terminal state: everything after the last chunk is ignored. */
    end(char) {
        return this.end;
    }
}

module.exports = ChunkedBodyParser;
//ResponseParser.js
const ChunkedBodyParser = require('./ChunkedBodyParser');

/**
 * Incremental parser for an HTTP/1.1 response.
 *
 * Text is pushed in with `receive(str)`, possibly across several calls, and
 * processed one character at a time. Each state is a method that consumes
 * one character and returns the method to use for the next one. Only bodies
 * sent with `Transfer-Encoding: chunked` are decoded.
 */
class ResponseParser {
    statusLine = '';
    headers = {};
    headerName = '';
    headerValue = '';
    bodyParser = null;
    // Current state. It lives on the instance so that a response split over
    // several `receive` calls continues where the previous call stopped.
    state = this.buildStatusLine;

    constructor() {}

    /** True once a body parser exists and reports the body as complete. */
    get isFinished() {
        return Boolean(this.bodyParser && this.bodyParser.isFinished);
    }

    /**
     * Snapshot of what has been parsed so far.
     *
     * @returns {{statusCode: string, statusText: string, headers: Object, body: string}}
     *   `statusCode` and `statusText` are empty strings when the status line
     *   is not a recognisable HTTP/1.1 status line; `body` is an empty
     *   string when no body parser was created.
     */
    get response() {
        const matched = this.statusLine.match(/HTTP\/1\.1 ([0-9]+) ([\s\S]*)/);
        return {
            statusCode: matched ? matched[1] : '',
            statusText: matched ? matched[2] : '',
            headers: this.headers,
            body: this.bodyParser ? this.bodyParser.content.join('') : ''
        }
    }

    /**
     * Feeds the next piece of the response into the state machine.
     *
     * @param {string} str - next fragment of the raw response text.
     */
    receive(str) {
        for(let position = 0; position < str.length; position++) {
            this.state = this.state(str.charAt(position));
        }
    }

    /** Accumulates the status line up to its terminating '\r'. */
    buildStatusLine(char){
        if(char === '\r') {
            return this.waitStatusLineEnd;
        }else {
            this.statusLine += char;
            return this.buildStatusLine;
        }
    }

    /** Waits for the '\n' that ends the status line. */
    waitStatusLineEnd(char) {
        if (char === '\n') {
            return this.buildHeaderName;
        } else {
            return this.waitStatusLineEnd;
        }
    }

    /** Accumulates a header name; a '\r' here means the header section is over. */
    buildHeaderName(char) {
        if(char === ':') {
            return this.waitHeaderSpace;
        }else if(char === '\r') {
            return this.headerBuildEnd;
        }else {
            this.headerName += char;
            return this.buildHeaderName;
        }
    }

    /** Skips the optional space after ':' without losing a value that starts at once. */
    waitHeaderSpace(char){
        if(char === ' ') {
            return this.buildHeaderValue;
        }else {
            // No space after the colon: this character already belongs to the value.
            return this.buildHeaderValue(char);
        }
    }

    /** Accumulates a header value and stores the header at the terminating '\r'. */
    buildHeaderValue(char){
        if(char === '\r') {
            this.headers[this.headerName] = this.headerValue;
            this.headerName = '';
            this.headerValue = '';
            return this.waitHeaderLineEnd;
        }else {
            this.headerValue += char;
            return this.buildHeaderValue;
        }
    }

    /** Waits for the '\n' that ends a header line. */
    waitHeaderLineEnd(char){
        if(char === '\n') {
            return this.buildHeaderName;
        }else {
            return this.waitHeaderLineEnd;
        }
    }

    /** Waits for the '\n' of the blank line after the headers, then sets up the body parser. */
    headerBuildEnd(char){
        if(char === '\n') {
            // Header field names are case-insensitive, so look the name up that way.
            const encodingName = Object.keys(this.headers)
                .find(name => name.toLowerCase() === 'transfer-encoding');
            const encoding = encodingName === undefined ? '' : this.headers[encodingName];
            if(encoding.trim().toLowerCase() === 'chunked') {
                this.bodyParser = new ChunkedBodyParser();
            }
            return this.buildBody;
        }else {
            return this.headerBuildEnd;
        }
    }

    /** Forwards every body character to the body parser, when there is one. */
    buildBody(char){
        if(this.bodyParser) {
            this.bodyParser.parser(char);
        }
        return this.buildBody;
    }

}

module.exports = ResponseParser;
//client.js
const net = require('net');
const ResponseParser = require('./ResponseParser');
const HtmlParser = require('./HtmlParser');

/**
 * A minimal HTTP/1.1 request that can serialise itself and send itself over
 * a TCP connection.
 */
class Request {

    /**
     * @param {Object} options
     * @param {string} [options.method='GET'] - HTTP method.
     * @param {string} options.host - host to connect to.
     * @param {number|string} [options.port=80] - port to connect to.
     * @param {string} [options.path='/'] - request target.
     * @param {Object|string} [options.body={}] - body; an object is serialised
     *   according to the Content-Type header.
     * @param {Object} [options.headers={}] - extra headers; copied, the
     *   caller's object is not modified.
     * @throws {TypeError} when a non-empty, non-string body is given together
     *   with a Content-Type this class cannot serialise.
     */
    constructor(options) {
        this.method = options.method || 'GET';
        this.host = options.host;
        this.port = options.port || 80;
        this.path = options.path || '/';
        this.body = options.body || {};
        this.headers = { ...options.headers };

        if(!this.headers['Content-Type']) {
            this.headers['Content-Type'] = 'application/x-www-form-urlencoded';
        }

        // HTTP/1.1 requests must carry a Host header; recent Node servers
        // answer 400 without one.
        const hasHost = Object.keys(this.headers).some(name => name.toLowerCase() === 'host');
        if(!hasHost && this.host) {
            this.headers['Host'] = Number(this.port) === 80 ? this.host : `${this.host}:${this.port}`;
        }

        const contentType = this.headers['Content-Type'];
        if(contentType === 'application/json') {
            this.bodyText = JSON.stringify(this.body);
        }else if(contentType === 'application/x-www-form-urlencoded') {
            this.bodyText = Object.keys(this.body).map(key=>{
                return `${encodeURIComponent(key)}=${encodeURIComponent(this.body[key])}`;
            }).join('&');
        }else if(typeof this.body === 'string') {
            // Unknown media type: a ready-made string is sent as is.
            this.bodyText = this.body;
        }else if(Object.keys(this.body).length === 0) {
            this.bodyText = '';
        }else {
            throw new TypeError(`Cannot serialize a non-string body for Content-Type "${contentType}"`);
        }

        // The length is measured in bytes, which differs from the character
        // count as soon as the body contains non-ASCII text.
        this.headers['Content-length'] = Buffer.byteLength(this.bodyText);
    }

    /**
     * Serialises the request: request line, headers, blank line, body.
     *
     * @returns {string} the raw request text, lines separated by CRLF.
     */
    toString(){
        const headerLines = Object.keys(this.headers).map(key=>`${key}: ${this.headers[key]}`);
        return [
            `${this.method} ${this.path} HTTP/1.1`,
            ...headerLines,
            '',
            this.bodyText
        ].join('\r\n');
    }

    /**
     * Sends the request and resolves with the parsed response.
     *
     * @param {net.Socket} [connection] - connection to reuse; when omitted a
     *   new one is opened to `host:port`.
     * @returns {Promise<Object>} resolves with the parser's `response` once
     *   the parser reports the response as finished; rejects on a connection
     *   error.
     */
    send(connection) {
        return new Promise((resolve, reject) => {
            const parser = new ResponseParser();
            let socket = connection;
            if(socket) {
                socket.write(this.toString());
            }else {
                socket = net.createConnection({
                    host: this.host,
                    port: this.port
                }, () => {
                    socket.write(this.toString());
                });
            }

            socket.on('data', data =>{
                const text = data.toString();
                console.log(text);
                parser.receive(text);

                if(parser.isFinished) {
                    resolve(parser.response);
                    socket.end();
                }
            });

            socket.on('error', err => {
                reject(err);
                socket.end();
            })
        })
    }
}

// Entry point: send one POST request to the local demo server and hand the
// response body to the HTML parser.
void async function () {
    const request = new Request({
        method: 'POST',
        host: '127.0.0.1',
        port: '8088',
        path: '/',
        headers: {
            ['X-Foo2']: 'custom',
        },
        body: {
            name: 'world'
        }
    });

    try {
        const response = await request.send();
        const dom = HtmlParser.parserHTML(response.body);
    } catch (err) {
        // Without this handler a failed connection or a parse error would
        // surface as an unhandled promise rejection.
        console.error(err);
        process.exitCode = 1;
    }
}();

3.2 HTML解析

  1. 使用有限状态机(FSM)实现
  2. html的标准中,已经规定了html的状态
  3. 简易浏览器只实现部分html状态
  1. 解析标签

标签分类:开始标签、结束标签、自封闭标签

  1. 创建元素
  2. 处理属性
  3. 用token构建DOM树
  1. 使用栈
  2. 遇到开始标签时创建元素并入栈,遇到结束标签时出栈
  3. 自封闭节点认为入栈后立即出栈
  4. 任何元素的父元素是他入栈前的栈顶元素
  1. 将文本节点加到DOM树
  1. 文本节点类似于自封闭标签,不会入栈
  2. 要注意多个文本节点的合并
//HtmlParser.js
// Parser state shared by the tokenizer states and the tree builder.
// Token currently being assembled (start or end tag).
let currentToken = null;
// Attribute currently being assembled; null once it has been stored.
let currentAttribute = null;
// Text node that consecutive text tokens are merged into.
let currentTextNode = null;
// Document node at the top of the tree.
let root = {type: 'document', children: []};
// Stack of open elements; the last entry is the parent of new nodes.
let stack = [root];

// Sentinel fed to the state machine after the last character.
const EOF = Symbol('EOF');

// Elements that never have content or an end tag (HTML "void elements").
const VOID_ELEMENTS = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'source', 'track', 'wbr'
]);

/** True when `char` is an HTML whitespace character (never true for EOF). */
function isWhitespace(char) {
    return typeof char === 'string' && /^[\t\n\f\r ]$/.test(char);
}

/** True when `char` is an ASCII letter (never true for EOF). */
function isLetter(char) {
    return typeof char === 'string' && /^[a-zA-Z]$/.test(char);
}

/**
 * Stores the attribute being assembled on the current start tag.
 * Attributes are kept in a list rather than as token properties, so a name
 * such as "type" cannot overwrite the token's own fields. The first
 * occurrence of a name wins; attributes on end tags are dropped. Calling it
 * again before a new attribute starts does nothing.
 */
function commitAttribute() {
    if (currentAttribute && currentAttribute.name !== '' && currentToken.type === 'startTag') {
        const pending = currentAttribute;
        const isDuplicate = currentToken.attributes.some(attr => attr.name === pending.name);
        if (!isDuplicate) {
            currentToken.attributes.push({name: pending.name, value: pending.value});
        }
    }
    currentAttribute = null;
}

/**
 * Tree builder: applies one token to the tree rooted at `root`.
 *
 * - startTag: creates an element under the top of the stack and pushes it,
 *   unless it is self-closing or a void element.
 * - endTag: pops the stack.
 * - text: appended to the current text node, so adjacent characters merge.
 * - any other type (such as 'EOF') is ignored.
 *
 * @param {Object} token - token produced by the tokenizer states.
 * @throws {Error} when an end tag does not match the element on top of the stack.
 */
function emit(token) {
    const top = stack[stack.length - 1];

    if (token.type === 'startTag') {
        const element = {
            type: 'element',
            children: [],
            attributes: (token.attributes || []).map(({name, value}) => ({name, value}))
        };

        element.tagName = token.tagName;

        top.children.push(element);
        element.parent = top;

        if (!token.isSelfClosing && !VOID_ELEMENTS.has(token.tagName)) {
            stack.push(element);
        }

        currentTextNode = null;
    } else if (token.type === 'endTag') {
        if (top.tagName !== token.tagName) {
            throw new Error(token.tagName + " Tag start end doesn't match!");
        }
        stack.pop();
        currentTextNode = null;
    } else if (token.type === 'text') {
        if (!currentTextNode) {
            currentTextNode = {
                type: 'text',
                content: ''
            };
            top.children.push(currentTextNode);
        }
        currentTextNode.content += token.content;
    }
}

/** Data state: plain text until a '<' opens a tag. */
function data(char) {
    if (char === '<') {
        return tagOpen;
    }
    if (char === EOF) {
        emit({
            type: 'EOF'
        });
        return data;
    }
    emit({
        type: 'text',
        content: char
    });
    return data;
}

/** Just after '<': decides between start tag, end tag, markup declaration and literal text. */
function tagOpen(char) {
    if (char === '/') {
        return endTagOpen;
    }
    if (isLetter(char)) {
        currentToken = {
            type: 'startTag',
            tagName: '',
            attributes: []
        };
        currentAttribute = null;
        return tagName(char);
    }
    if (char === '!' || char === '?') {
        // Doctype, comment or processing instruction: skipped up to '>'.
        return bogusComment;
    }
    // Not a tag after all: the '<' was literal text; reprocess this character.
    emit({
        type: 'text',
        content: '<'
    });
    return data(char);
}

/** Just after '</'. */
function endTagOpen(char) {
    if (isLetter(char)) {
        currentToken = {
            type: 'endTag',
            tagName: ''
        };
        currentAttribute = null;
        return tagName(char);
    }
    if (char === '>') {
        // "</>" carries no tag name and is ignored.
        return data;
    }
    if (char === EOF) {
        emit({
            type: 'text',
            content: '</'
        });
        return data(char);
    }
    return bogusComment;
}

/** Skips an unsupported construct (doctype, comment, ...) up to the next '>'. */
function bogusComment(char) {
    if (char === '>') {
        return data;
    }
    if (char === EOF) {
        return data(char);
    }
    return bogusComment;
}

/** Reads the tag name; every character up to whitespace, '/' or '>' belongs to it. */
function tagName(char) {
    if (char === EOF) {
        // Input ended inside a tag: the unfinished tag is dropped.
        return data(char);
    }
    if (isWhitespace(char)) {
        return beforeAttributeName;
    }
    if (char === '/') {
        return selfClosingStartTag;
    }
    if (char === '>') {
        emit(currentToken);
        return data;
    }
    // Digits and other characters are part of the name too (e.g. "h1").
    currentToken.tagName += char.toLowerCase();
    return tagName;
}

// Attribute parsing

/** Between attributes: skips whitespace and starts a new attribute. */
function beforeAttributeName(char) {
    if (isWhitespace(char)) {
        return beforeAttributeName;
    }
    if (char === '/' || char === '>' || char === EOF) {
        return afterAttributeName(char);
    }
    currentAttribute = {
        name: '',
        value: ''
    };
    if (char === '=') {
        // A stray '=' before any name is treated as the first character of the name.
        currentAttribute.name = char;
        return attributeName;
    }
    return attributeName(char);
}

/** Reads an attribute name. */
function attributeName(char) {
    if (char === '=') {
        return beforeAttributeValue;
    }
    if (isWhitespace(char) || char === '/' || char === '>' || char === EOF) {
        return afterAttributeName(char);
    }
    currentAttribute.name += char === '\u0000' ? '\uFFFD' : char;
    return attributeName;
}

/** After an attribute name: a value, another attribute, or the end of the tag may follow. */
function afterAttributeName(char) {
    if (isWhitespace(char)) {
        return afterAttributeName;
    }
    if (char === '=') {
        return beforeAttributeValue;
    }
    if (char === '/') {
        commitAttribute();
        return selfClosingStartTag;
    }
    if (char === '>') {
        commitAttribute();
        emit(currentToken);
        return data;
    }
    if (char === EOF) {
        return data(char);
    }
    // A new attribute starts: the previous one had no value.
    commitAttribute();
    currentAttribute = {
        name: '',
        value: ''
    };
    return attributeName(char);
}

/** After '=': decides how the value is quoted. */
function beforeAttributeValue(char) {
    if (isWhitespace(char)) {
        return beforeAttributeValue;
    }
    if (char === '"') {
        return doubleQuotedAttributeValue;
    }
    if (char === "'") {
        return singleQuotedAttributeValue;
    }
    if (char === '>') {
        // "name=>" : the value is missing, the attribute stays empty.
        commitAttribute();
        emit(currentToken);
        return data;
    }
    if (char === EOF) {
        return data(char);
    }
    return unquotedAttributeValue(char);
}

/** Just after the closing quote of an attribute value. */
function afterQuotedAttributeValue(char) {
    if (isWhitespace(char)) {
        return beforeAttributeName;
    }
    if (char === '/') {
        return selfClosingStartTag;
    }
    if (char === '>') {
        emit(currentToken);
        return data;
    }
    if (char === EOF) {
        return data(char);
    }
    // Missing whitespace between attributes: treat the character as the
    // start of the next attribute.
    return beforeAttributeName(char);
}

/**
 * Shared body of the two quoted-value states.
 *
 * @param {string|symbol} char - current input character.
 * @param {string} quote - the quote character that ends the value.
 * @param {Function} self - the state to stay in while the value continues.
 */
function quotedAttributeValue(char, quote, self) {
    if (char === quote) {
        commitAttribute();
        return afterQuotedAttributeValue;
    }
    if (char === EOF) {
        return data(char);
    }
    currentAttribute.value += char === '\u0000' ? '\uFFFD' : char;
    return self;
}

/** Inside a "double quoted" attribute value. */
function doubleQuotedAttributeValue(char) {
    return quotedAttributeValue(char, '"', doubleQuotedAttributeValue);
}

/** Inside a 'single quoted' attribute value. */
function singleQuotedAttributeValue(char) {
    return quotedAttributeValue(char, "'", singleQuotedAttributeValue);
}

/** Inside an unquoted attribute value; it ends at whitespace or '>'. */
function unquotedAttributeValue(char) {
    if (char === EOF) {
        return data(char);
    }
    if (isWhitespace(char)) {
        commitAttribute();
        return beforeAttributeName;
    }
    if (char === '>') {
        commitAttribute();
        emit(currentToken);
        return data;
    }
    // Every other character, '/' included, belongs to the value, so URLs
    // such as href=http://host/path stay intact.
    currentAttribute.value += char === '\u0000' ? '\uFFFD' : char;
    return unquotedAttributeValue;
}

// End of attribute parsing

/** After '/' inside a tag: '>' closes a self-closing tag. */
function selfClosingStartTag(char) {
    if (char === '>') {
        currentToken.isSelfClosing = true;
        emit(currentToken);
        return data;
    }
    if (char === EOF) {
        return data(char);
    }
    // A stray '/': carry on as if between attributes.
    return beforeAttributeName(char);
}

/**
 * Parses an HTML string into a simple DOM tree.
 *
 * The parser state is reset first, so every call builds a fresh tree instead
 * of appending to the tree left over from a previous call.
 *
 * @param {string} html - HTML source text.
 * @returns {Object} the document node: `{type: 'document', children: [...]}`.
 */
module.exports.parserHTML = function (html) {
    root = {type: 'document', children: []};
    stack = [root];
    currentToken = null;
    currentAttribute = null;
    currentTextNode = null;

    let state = data;
    for (const char of html) {
        state = state(char);
    }
    // Tell the state machine that the input is exhausted.
    state(EOF);
    console.log(root);
    return root;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值