抓取的正则
- 匹配的正则表达式
//背景图 @"url\s*\(([^\)]+)\)"
//Html标签图 @"<img.*?src=""([^""]*)"".*?>"
- 图片抓取
/// <summary>
/// Downloads an image from a network URL and re-uploads it to the image server as a
/// multipart/form-data POST (single part, form field "file").
/// </summary>
/// <param name="networkFileUrl">URL of the network image to download.</param>
/// <param name="serverUri">Address of the image server that receives the upload.</param>
/// <param name="querystring">Optional request parameters appended to <paramref name="serverUri"/> as a query string.</param>
/// <param name="cookies">Optional cookies attached to the upload request.</param>
/// <returns>The <c>url</c> member of the server's JSON reply, deserialized as <c>UploadResult</c>.</returns>
/// <exception cref="InvalidOperationException">The server reply could not be deserialized into an <c>UploadResult</c>.</exception>
public string SendNetworkFileToServer(string networkFileUrl, string serverUri, NameValueCollection querystring = null, CookieContainer cookies = null)
{
    byte[] imageBytes = DownloadImageBytes(networkFileUrl);

    string contentType;
    string fileNameAndExtension = BuildUploadFileName(networkFileUrl, out contentType);
    Uri uri = BuildUploadUri(serverUri, querystring);

    string boundary = "----------" + DateTime.Now.Ticks.ToString("x");
    HttpWebRequest webrequest = (HttpWebRequest)WebRequest.Create(uri);
    webrequest.CookieContainer = cookies;
    webrequest.ContentType = "multipart/form-data; boundary=" + boundary;
    webrequest.Method = "POST";
    webrequest.Accept = "*/*";
    webrequest.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";

    // Header of the single file part.
    StringBuilder sb = new StringBuilder();
    sb.Append("--").Append(boundary).Append("\r\n");
    sb.Append("Content-Disposition: form-data; name=\"file\"; filename=\"");
    sb.Append(fileNameAndExtension);
    sb.Append("\"\r\n");
    sb.Append("Content-Type: ").Append(contentType).Append("\r\n");
    sb.Append("\r\n");
    byte[] postHeaderBytes = Encoding.ASCII.GetBytes(sb.ToString());

    // Closing delimiter: the final boundary must be followed by "--" (RFC 2046, section 5.1.1),
    // otherwise strict parsers treat the body as truncated.
    byte[] boundaryBytes = Encoding.ASCII.GetBytes("\r\n--" + boundary + "--\r\n");

    webrequest.ContentLength = (long)postHeaderBytes.Length + imageBytes.Length + boundaryBytes.Length;

    // Dispose the request stream before asking for the response so the body is fully flushed
    // and the stream is released even when a write fails.
    using (Stream requestStream = webrequest.GetRequestStream())
    {
        requestStream.Write(postHeaderBytes, 0, postHeaderBytes.Length);
        requestStream.Write(imageBytes, 0, imageBytes.Length);
        requestStream.Write(boundaryBytes, 0, boundaryBytes.Length);
    }

    string result;
    using (WebResponse response = webrequest.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream))
    {
        result = reader.ReadToEnd();
    }

    var objResult = Newtonsoft.Json.JsonConvert.DeserializeObject<UploadResult>(result);
    if (objResult == null)
    {
        // An empty or "null" reply deserializes to null; fail with a meaningful error.
        throw new InvalidOperationException("The image server returned an empty or unrecognized upload response.");
    }
    return objResult.url;
}

/// <summary>
/// Reads the complete response body of <paramref name="networkFileUrl"/> into memory.
/// </summary>
private static byte[] DownloadImageBytes(string networkFileUrl)
{
    var request = WebRequest.Create(new Uri(networkFileUrl));
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        // Copy until end-of-stream rather than trusting ContentLength: the length is -1 when
        // the server does not send it, and a short body would otherwise never satisfy the count.
        responseStream.CopyTo(buffer);
        return buffer.ToArray();
    }
}

/// <summary>
/// Derives the upload file name (base name plus normalized extension) and the matching
/// content type from the image URL. Falls back to ".jpg" / "image/jpeg" when no known
/// image extension is present.
/// </summary>
private static string BuildUploadFileName(string networkFileUrl, out string contentType)
{
    // Only the last path segment names the file; ignore any query string or fragment.
    string path = networkFileUrl;
    int cut = path.IndexOfAny(new[] { '?', '#' });
    if (cut >= 0)
    {
        path = path.Substring(0, cut);
    }
    string segment = path.Substring(path.LastIndexOf('/') + 1);

    string extension;
    int extensionIndex = FindImageExtension(segment, out extension, out contentType);
    if (extensionIndex >= 0)
    {
        return segment.Substring(0, extensionIndex) + extension;
    }

    // No extension in the file segment: still honor one that appears elsewhere in the URL
    // (e.g. inside the query string) when choosing the content type.
    FindImageExtension(networkFileUrl, out extension, out contentType);
    return (segment.Length > 0 ? segment : "file") + extension;
}

/// <summary>
/// Looks for a known image extension in <paramref name="text"/> (case-insensitive).
/// Returns the index of the match, or -1 with the JPEG defaults when none is found.
/// </summary>
private static int FindImageExtension(string text, out string extension, out string contentType)
{
    // Columns: text to search for, extension to emit, content type.
    // Ordered so that gif wins over bmp, bmp over png, png over jpeg/jpg.
    string[,] knownTypes =
    {
        { ".gif", ".gif", "image/gif" },
        { ".bmp", ".bmp", "image/bmp" },
        { ".png", ".png", "image/x-png" },
        { ".jpeg", ".jpg", "image/jpeg" },
        { ".jpg", ".jpg", "image/jpeg" }
    };

    for (int i = 0; i < knownTypes.GetLength(0); i++)
    {
        int index = text.IndexOf(knownTypes[i, 0], StringComparison.OrdinalIgnoreCase);
        if (index >= 0)
        {
            extension = knownTypes[i, 1];
            contentType = knownTypes[i, 2];
            return index;
        }
    }

    extension = ".jpg";
    contentType = "image/jpeg";
    return -1;
}

/// <summary>
/// Appends the optional parameters to the server address as "?k1=v1&amp;k2=v2".
/// </summary>
private static Uri BuildUploadUri(string serverUri, NameValueCollection querystring)
{
    if (querystring == null)
    {
        return new Uri(serverUri);
    }

    // NOTE(review): keys and values are appended verbatim (not URL-encoded), so callers
    // are presumably expected to pass values that are already safe for a query string.
    StringBuilder query = new StringBuilder("?");
    foreach (string key in querystring.Keys)
    {
        if (query.Length > 1)
        {
            query.Append('&');
        }
        query.Append(key).Append('=').Append(querystring.Get(key));
    }
    return new Uri(serverUri + query);
}
- 并发处理
// _filterImageSrc.ImagesSrcPair is a ConcurrentDictionary<string, string>:
// the key is the matched value before replacement, the value is the processed image URL.
// Processes every key in parallel (at most 32 workers) and writes each resulting
// DestUrl back as the value of the key whose text equals the result's SourceUrl.
public void ProcessTranslateImageSrc()
{
    // Work on the key collection captured here; keys added afterwards are left untouched.
    var imageUrls = this._filterImageSrc.ImagesSrcPair.Keys;
    if (imageUrls.Count == 0)
    {
        return;
    }

    var translatedItems = imageUrls.AsParallel()
        .WithDegreeOfParallelism(imageUrls.Count > 32 ? 32 : imageUrls.Count)
        .Select(imageUrl => ProcessTranslateItem(imageUrl, this._defaultServerUri, this._defaultNameValueCollection, null))
        .ToList();

    // Index the results once instead of scanning the whole list for every key.
    var itemsBySourceUrl = translatedItems.ToLookup(entity => entity.SourceUrl);

    foreach (var imageUrl in imageUrls)
    {
        var matches = itemsBySourceUrl[imageUrl];
        // A key with no matching result keeps its current value instead of
        // dereferencing a missing item.
        if (matches.Any())
        {
            this._filterImageSrc.ImagesSrcPair[imageUrl] = matches.First().DestUrl;
        }
    }
}