打造网站克隆、仿站工具(含源码)
本次的网站克隆工具(仿站工具)是用 .NET Core 编写的控制台程序,基于 Selenium + ChromeDriver,适合懂编程的人使用。该程序模拟 Chrome 浏览器,能够成功抓取页面代码,包括经浏览器解析后生成的那部分代码,因此抓取效果比一般的直接获取页面源码的方法更好。
代码部分
(UrlHandler.cs)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using ConsoleApp2.Models;
namespace ConsoleApp2.Handlers
{
public class UrlHandler
{
/// <summary>
/// URL entries collected from the page; <c>ExtractUrl</c> appends one item per regex match.
/// </summary>
public List<UrlItem> UrlItemList = new List<UrlItem>();
/// <summary>
/// Secondary URL list, created empty.
/// NOTE(review): presumably intended for URLs found inside stylesheets (see <c>GetCSSUrl</c>);
/// nothing in the visible part of the file writes to it - confirm against the rest of the class.
/// </summary>
public List<UrlItem> UrlItemListCSS = new List<UrlItem>();
/// <summary>
/// Page source text handed to the constructor.
/// </summary>
public string PageSource {
get; set; }
/// <summary>
/// Creates a handler bound to the given page source.
/// </summary>
/// <param name="pageSource">Text stored in <see cref="PageSource"/>; it is not parsed here.</param>
public UrlHandler(string pageSource)
{
    // Begin with an empty URL list, then remember the markup for later processing.
    UrlItemList = new List<UrlItem>();
    PageSource = pageSource;
}
/// <summary>
/// Scans <paramref name="pageSource"/> for resource references and appends one
/// <see cref="UrlItem"/> per match to <see cref="UrlItemList"/>.
/// </summary>
/// <remarks>
/// Recognised forms are <c>href="..."</c>, <c>src="..."</c> and <c>url(...)</c> (double quotes
/// optional inside <c>url()</c>), matched case-insensitively. Every match is echoed to the
/// console between BEGIN/END banner lines.
/// </remarks>
/// <param name="pageSource">Markup to scan. A null or empty value produces no items.</param>
private void ExtractUrl(string pageSource)
{
    const string pattern = "href=\"(?<Url>.+?)\"|src=\"(?<Url>.+?)\"|url\\(\"*(?<Url>.+?)\"*\\)";
    var reg = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);

    Console.WriteLine("======================ExtractUrl BEGIN======================");

    // Regex.Matches throws on null input, so treat "no markup" as "no matches".
    // No IsMatch pre-check is needed: Matches simply yields an empty collection.
    if (!string.IsNullOrEmpty(pageSource))
    {
        foreach (Match mm in reg.Matches(pageSource))
        {
            // Trim once up front so the type lookup, the stored value and the log line
            // all see the same text; stray whitespace would otherwise leak into the
            // file extension used for classification.
            var url = mm.Groups["Url"].Value.Trim();

            UrlItemList.Add(new UrlItem
            {
                Type = GetUrlType(url),
                Url = url,
                IsLocal = true
            });
            Console.WriteLine("[URL]:{0}", url);
        }
    }

    Console.WriteLine("======================ExtractUrl END======================");
}
/// <summary>
/// Collects the resources referenced inside a stylesheet.
/// </summary>
/// <remarks>
/// Recognised forms are <c>href="..."</c>, <c>src="..."</c> and <c>url(...)</c> (double quotes
/// optional inside <c>url()</c>), matched case-insensitively. Every match is echoed to the
/// console between BEGIN/END banner lines.
/// </remarks>
/// <param name="pageSource">Stylesheet text to scan. A null or empty value produces no items.</param>
/// <returns>A new list with one <see cref="UrlItem"/> per match; empty when nothing matched.</returns>
private List<UrlItem> GetCSSUrl(string pageSource)
{
    const string pattern = "href=\"(?<Url>.+?)\"|src=\"(?<Url>.+?)\"|url\\(\"*(?<Url>.+?)\"*\\)";
    var reg = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var list = new List<UrlItem>();

    Console.WriteLine("======================GetCSSUrl BEGIN======================");

    // Regex.Matches throws on null input, so treat "no text" as "no matches".
    // No IsMatch pre-check is needed: Matches simply yields an empty collection.
    if (!string.IsNullOrEmpty(pageSource))
    {
        foreach (Match mm in reg.Matches(pageSource))
        {
            // Trim once up front so the type lookup, the stored value and the log line
            // all see the same text; stray whitespace would otherwise leak into the
            // file extension used for classification.
            var url = mm.Groups["Url"].Value.Trim();

            list.Add(new UrlItem
            {
                Type = GetUrlType(url),
                Url = url,
                IsLocal = true
            });
            Console.WriteLine("[URL]:{0}", url);
        }
    }

    Console.WriteLine("======================GetCSSUrl END======================");
    return list;
}
/// <summary>
/// 获取资源的类型
/// </summary>
/// <param name="url"></param>
/// <returns></returns>
private UrlType GetUrlType(string url)
{
var type = UrlType.Other;
var extend = Path.GetExtension(url);
if (string.