C语言URL解析器(代码分享)
By qianghaohao(Xqiang)
本程序可以解析出URL中每个字段的值并存入结构体中,
也可以调用提供的API只获取所需的部分.
本URL解析器代码来自
https://github.com/jwerle/url.h
在此基础上进行了大量的修改,修复了很多bug.总体思路
没有改变,就是把很多细节改了下,现在可以正常使用了.
-->可能还存在bug,如果网友发现了可以及时指正.
感受:花了一天多的时间修复,充分感受到了IDE对编程效率
的影响:一个很奇葩的越界问题,自己调了半天也没找到
原因……结果别人五分钟就搞定了……
话不多说,直接上源代码:
接口文件:url_parser.h:
#ifndef URL_PARSER
#define URL_PARSER
/**
* Dependencies
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
/**
* url.h version
*
* NOTE(review): `0.0.2` is not a valid C numeric constant, so this macro
* cannot be used directly in an expression; it is only usable through
* preprocessor stringification. Consider a string literal if callers allow.
*/
#define URL_VERSION 0.0.2
/**
* Max length of a url protocol scheme
*/
#define URL_PROTOCOL_MAX_LENGTH 16
/**
* Max length of a url host part
*/
#define URL_HOSTNAME_MAX_LENGTH 128
/**
* Max length of a url tld part
*/
#define URL_TLD_MAX_LENGTH 16
/**
* Max length of a url auth part
*/
#define URL_AUTH_MAX_LENGTH 32
/**
* NOTE(review): presumably the max length of a whole URL string handled
* by the parser -- confirm against its use in url_parser.c.
*/
#define URL_MAX_LENGTH 1024
/**
* `url_data` struct that defines parts
* of a parsed URL such as host and protocol
*
* URL structure information: one string pointer per URL part.
*
* NOTE(review): the per-field meanings are not established by this header.
* The field names match those of the Node.js legacy `url.parse()` result
* object, so they presumably carry the same meaning -- confirm against
* url_parse() in url_parser.c. Ownership of the pointed-to strings is also
* not visible here; see url_free().
*/
typedef struct url_data {
char *href;
char *protocol;
char *host;
char *auth;
char *hostname;
char *pathname;
char *search;
char *path;
char *hash;
char *query;
char *port;
} url_data_t;
// prototype
/**
* Parses a url into parts and returns
* a `url_data_t *` pointer
*
* NOTE(review): the failure return value and the ownership of the result
* are not visible in this header -- presumably the result is released with
* url_free(); confirm in url_parser.c.
*/
url_data_t *
url_parse (char *url);
/**
* Single-part accessors: each `url_get_<part>` takes a URL string and
* returns a `char *` for the part named in the function, so a caller can
* fetch only the part it needs instead of a full `url_data_t`.
*
* NOTE(review): whether the returned string is heap-allocated (and must be
* freed by the caller) and what is returned when the part is absent cannot
* be told from this header -- confirm in url_parser.c.
*/
char *
url_get_protocol (char *url);
char *
url_get_auth (char *url);
char *
url_get_hostname (char *url);
char *
url_get_host (char *url);
char *
url_get_pathname (char *url);
char *
url_get_path (char *url);
char *
url_get_search (char *url);
char *
url_get_query (char *url);
char *
url_get_hash (char *url);
char *
url_get_port (char *url);
/**
* NOTE(review): presumably releases a `url_data_t` obtained from
* url_parse() -- confirm which members it frees.
*/
void
url_free (url_data_t *data);
/**
* NOTE(review): presumably reports whether `str` is one of the known
* schemes listed in URL_SCHEMES -- confirm in url_parser.c.
*/
bool
url_is_protocol (char *str);
/**
* NOTE(review): presumably reports whether `str` names an ssh-style
* scheme -- confirm the exact set of accepted values in url_parser.c.
*/
bool
url_is_ssh (char *str);
/**
* NOTE(review): presumably parses `url` and prints its parts for
* debugging -- confirm in url_parser.c.
*/
void
url_inspect (char *url);
/**
* NOTE(review): presumably prints the parts held in an already parsed
* `url_data_t` for debugging -- confirm in url_parser.c.
*/
void
url_data_inspect (url_data_t *data);
#endif
实现文件:url_parser.c:
#include "url_parser.h"
/**
* URI Schemes
* http://en.wikipedia.org/wiki/URI_scheme
*/
// URL协议头表:用来判断解析出来的协议是否在此表中
char *URL_SCHEMES[] = {
// official IANA registered schemes
"aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
"beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "crome-extension", "com-evenbrite-attendee",
"cid", "coap", "coaps","content", "crid", "cvs", "data", "dav", "dict", "lna-playsingle", "dln-playcontainer",
"dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish","ftp", "geo", "gg","git",
"gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon","im",
"imap", "info", "ipn", "ipp", "irc", "irc6", "ircs