URL(Uniform Resource Locator),统一资源定位符,用来标识WWW上的某个信息资源;它通过描述资源的主要访问机制(例如协议和网络位置)来定位该资源,本质上是一个字符串。
URI(Uniform Resource Identifier), 统一资源标识符,用来标识抽象或物理资源的一个紧凑字符串
URN(Uniform Resource Name), 统一资源名称,通过特定命名空间中的唯一名称或ID来标识资源。
关系: URI = URL + URN, 即URL是URI的子集
一个标准的URL通常由以下部分组成:protocol, host, port(HTTP默认80,HTTPS默认443), path, parameter, anchor;其中port、parameter、anchor可以省略。
例子(只是例子,不能直接访问): http://www.baidu.com:80/index.html?name=567&age=23#header
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Demonstrates how {@link URL} splits a URL string into its components.
 *
 * <p>Each component (protocol, host, port, file, path, query, fragment) is
 * printed on its own line to standard output.
 */
public class Main {
    /** Sample address that carries every component: scheme, host, port, path, query and fragment. */
    private static final String SAMPLE_ADDRESS =
            "http://www.baidu.com:80/index.html?name=567&age=23#header";

    /**
     * Parses the sample address and prints its components.
     *
     * @param args unused
     * @throws MalformedURLException if the sample address cannot be parsed
     */
    public static void main(String[] args) throws MalformedURLException {
        final URL parsed = new URL(SAMPLE_ADDRESS);

        report("协议:", parsed.getProtocol());
        report("主机:", parsed.getHost());
        report("端口:", parsed.getPort());
        // getFile() is the path plus the query string; getPath() is the path alone.
        report("请求资源1:", parsed.getFile());
        report("请求资源2:", parsed.getPath());
        report("参数:", parsed.getQuery());
        report("锚点:", parsed.getRef());
    }

    /** Prints one line consisting of the label immediately followed by the value. */
    private static void report(String label, Object value) {
        System.out.println(label + value);
    }
}
/**
协议:http
主机:www.baidu.com
端口:80
请求资源1:/index.html?name=567&age=23
请求资源2:/index.html
参数:name=567&age=23
锚点:header
*/
爬虫Demo(非商用仅供学习参考使用):
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import javax.net.ssl.HttpsURLConnection;
/**
 * Minimal crawler demo (for study only, not for commercial use): downloads one
 * page over HTTPS and echoes the response body to standard output line by line.
 */
public class Main {
    /** Page to download. An alternative target used while experimenting: https://www.jd.com/ */
    private static final String TARGET = "https://www.dianping.com/";

    /**
     * Browser User-Agent string, copied from the browser's DevTools (F12, Network tab).
     * The original author observed a 403 response when the page was fetched without it.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36";

    /** Upper bound for establishing the connection, so a dead host cannot hang the program. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;

    /** Upper bound for each blocking read, so a stalled response cannot hang the program. */
    private static final int READ_TIMEOUT_MS = 15_000;

    /**
     * Fetches {@link #TARGET} with a GET request and prints the body, decoded as UTF-8.
     *
     * @param args unused
     * @throws Exception if the URL is malformed, the connection fails or times out,
     *                   or the server answers with an error status
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL(TARGET);
        // url.openStream() is not used here: it offers no way to set the User-Agent header.
        HttpsURLConnection conn = (HttpsURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            // try-with-resources closes the reader (and the underlying stream) even when
            // reading or printing throws.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        } finally {
            // Release the connection whether or not the download succeeded.
            conn.disconnect();
        }
    }
}