/// <summary>
/// Common helper methods for downloading, cleaning, encoding and parsing HTML.
/// </summary>
public class HtmlHelper
{
#region 获取页面源代码
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the raw bytes with the named encoding.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">Encoding name handed to <c>Encoding.GetEncoding</c> (for example "utf-8" or "gb2312").</param>
/// <returns>The decoded response body.</returns>
/// <remarks>No exception is caught here; download and encoding-lookup failures propagate to the caller.</remarks>
public static string GetHTML(string url, string encoding)
{
    // WebClient is IDisposable: release it even when the download throws.
    using (WebClient web = new WebClient())
    {
        byte[] buffer = web.DownloadData(url);
        return Encoding.GetEncoding(encoding).GetString(buffer);
    }
}
/// <summary>
/// Reads a page through <c>WebClient.OpenRead</c> and decodes the stream with the named encoding.
/// </summary>
/// <param name="url">Address of the page to read.</param>
/// <param name="encoding">Encoding name handed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The full response body as text.</returns>
/// <remarks>No exception is caught here; failures propagate to the caller.</remarks>
public static string GetWebClient(string url, string encoding)
{
    // Nested using blocks guarantee the client, the stream and the reader are
    // released even if reading fails part-way through.
    using (WebClient client = new WebClient())
    using (Stream stream = client.OpenRead(url))
    using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding(encoding)))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Reads a page through <c>WebRequest</c> and decodes the response stream with the named encoding.
/// </summary>
/// <param name="url">Absolute address of the page; it is parsed with <c>new Uri(url)</c>.</param>
/// <param name="encoding">Encoding name handed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The full response body as text.</returns>
/// <remarks>No exception is caught here; URI, network and encoding failures propagate to the caller.</remarks>
public static string GetWebRequest(string url, string encoding)
{
    WebRequest request = WebRequest.Create(new Uri(url));
    // Dispose response, stream and reader on every path, including exceptions.
    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding(encoding)))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Reads a page through <c>HttpWebRequest</c>, sending browser-like headers, and decodes
/// the response stream with the named encoding.
/// </summary>
/// <param name="url">Absolute address of the page; it is parsed with <c>new Uri(url)</c>.</param>
/// <param name="encoding">Encoding name handed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The full response body as text.</returns>
/// <remarks>No exception is caught here; URI, network and encoding failures propagate to the caller.</remarks>
public static string GetHttpWebRequest(string url, string encoding)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // The property supplies the header name itself, so the value must not repeat
    // "User-Agent:"; the closing parenthesis of the product comment is restored too.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
    // Dispose response, stream and reader on every path, including exceptions.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding(encoding)))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Downloads a page line by line using the system default encoding (best effort).
/// </summary>
/// <param name="url">Absolute address of the page.</param>
/// <returns>
/// The lines read so far, each terminated by "\r\n". If the request fails, whatever was
/// read before the failure is returned (an empty string when nothing was read).
/// </returns>
/// <remarks>
/// Every exception is deliberately swallowed so callers get a string rather than a failure.
/// An empty response now yields an empty string instead of a lone "\r\n".
/// </remarks>
public static string GetHtmlCode(string url)
{
    // StringBuilder avoids the quadratic cost of repeated string concatenation.
    StringBuilder html = new StringBuilder();
    try
    {
        WebRequest request = WebRequest.Create(new Uri(url));
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            string line;
            // ReadLine returns null at end of stream, so no phantom line is appended.
            while ((line = reader.ReadLine()) != null)
            {
                html.Append(line).Append("\r\n");
            }
        }
    }
    catch (Exception)
    {
        // Best effort by design: fall through and return what has been collected.
    }
    return html.ToString();
}
#endregion
#region 清除格式化html标记
/// <summary>
/// Strips script blocks and all markup from HTML source, decodes a handful of common
/// entities, and finally removes any angle brackets that remain.
/// </summary>
/// <param name="Html">HTML source.</param>
/// <returns>The text with markup removed; null or empty input is returned unchanged.</returns>
/// <remarks>
/// The final bracket removal means text such as "&amp;lt;script&amp;gt;" cannot come back
/// as a live tag after entity decoding.
/// </remarks>
public static string RemoveHtml(string Html)
{
    if (string.IsNullOrEmpty(Html))
    {
        return Html;
    }
    // Remove scripts. Singleline lets '.' span line breaks so multi-line scripts are removed whole.
    Html = Regex.Replace(Html, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Remove tags.
    Html = Regex.Replace(Html, "<.+?>", "", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    // Drop a line break together with the whitespace that follows it.
    Html = Regex.Replace(Html, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    // Remove comment remnants that the tag patterns did not catch.
    Html = Regex.Replace(Html, @"-->", "", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"<!--.*", "", RegexOptions.IgnoreCase);
    // Decode common entities.
    Html = Regex.Replace(Html, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    Html = Regex.Replace(Html, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
    // Any other numeric entity is dropped.
    Html = Regex.Replace(Html, @"&#(\d+);", "", RegexOptions.IgnoreCase);
    // Strings are immutable: the results must be assigned back, otherwise these calls do nothing.
    Html = Html.Replace("<", "");
    Html = Html.Replace(">", "");
    Html = Html.Replace("\r\n", "");
    return Html;
}
/// <summary>
/// Compacts HTML source by removing whitespace between tags and line breaks with their
/// trailing whitespace.
/// </summary>
/// <param name="Html">HTML source.</param>
/// <returns>The compacted HTML.</returns>
public static string ZipHtml(string Html)
{
    // Collapse whitespace that sits between a closing '>' and the next '<'.
    string compact = Regex.Replace(Html, @">\s+?<", "><");
    // Remove CRLF together with any whitespace that follows it.
    compact = Regex.Replace(compact, @"\r\n\s*", "");
    // Re-emit the body element; because matching ignores case, this writes the
    // opening and closing body tag names in lower case.
    Regex bodyElement = new Regex(@"<body([\s|\S]*?)>([\s|\S]*?)</body>", RegexOptions.IgnoreCase);
    return bodyElement.Replace(compact, @"<body$1>$2</body>");
}
/// <summary>
/// Escapes plain text so that it is displayed literally in an HTML page: spaces, quotes and
/// angle brackets become entities and line breaks become BR tags.
/// </summary>
/// <param name="Html">Text to escape.</param>
/// <returns>The escaped text; null or empty input is returned unchanged.</returns>
/// <remarks>
/// NOTE(review): the entity literals in this method had been reduced to the characters they
/// stand for (the double-quote replacement did not even compile). They are restored here to
/// the entity text; confirm against the project's history.
/// </remarks>
public static string FormatHtml(string Html)
{
    if (string.IsNullOrEmpty(Html))
    {
        return Html;
    }
    // Spaces become non-breaking-space entities.
    Html = Html.Replace(" ", "&nbsp;");
    // A straight single quote becomes a typographic right single quote.
    Html = Html.Replace("'", "’");
    // Double quotes.
    Html = Html.Replace("\"", "&quot;");
    // Markup delimiters. This runs before the BR tags are inserted so they are not escaped.
    Html = Html.Replace("<", "&lt;");
    Html = Html.Replace(">", "&gt;");
    // A line break followed by indentation starts a new line that is indented with two
    // full-width (U+3000) spaces, as the original comment on this step describes.
    Html = Regex.Replace(
        Html,
        @"\r\n(?:&nbsp;|\u3000)+(?<body>\S+)",
        "<BR>\u3000\u3000${body}",
        RegexOptions.IgnoreCase);
    // Remaining line breaks become plain BR tags.
    Html = Html.Replace("\r\n", "<BR>");
    return Html;
}
/// <summary>
/// Removes every tag (anything between '&lt;' and the next '&gt;') from the input.
/// </summary>
/// <param name="strhtml">HTML source.</param>
/// <returns>The input with all tags deleted; text between tags is kept.</returns>
public static string StripHtml(string strhtml)
{
    return Regex.Replace(strhtml, @"<[^>]+>|</[^>]+>", "");
}
#endregion
#region 文本中字符的转换
/// <summary>
/// Converts plain text to HTML: escapes ampersands and angle brackets, turns line breaks
/// into br tags, and turns tabs and spaces into non-breaking-space entities.
/// </summary>
/// <param name="str">Text to convert.</param>
/// <returns>The HTML form of the text; null or empty input is returned unchanged.</returns>
/// <remarks>
/// NOTE(review): the entity literals had been reduced to the characters they stand for,
/// which made most replacements no-ops; they are restored here.
/// </remarks>
public static String ToHtml(string str)
{
    if (str == null || str.Equals(""))
    {
        return str;
    }
    StringBuilder sb = new StringBuilder(str);
    // The ampersand must be escaped first so the entities added below are not escaped again.
    sb.Replace("&", "&amp;");
    sb.Replace("<", "&lt;");
    sb.Replace(">", "&gt;");
    sb.Replace("\r\n", "<br>");
    sb.Replace("\n", "<br>");
    // A tab becomes a space, which the next step turns into an entity.
    sb.Replace("\t", " ");
    sb.Replace(" ", "&nbsp;");
    return sb.ToString();
}
/// <summary>
/// Converts the HTML produced by text-to-HTML escaping back to plain text: decodes the
/// space, angle-bracket and ampersand entities and turns br tags into CRLF.
/// </summary>
/// <param name="str">HTML to convert.</param>
/// <returns>The plain-text form; null or empty input is returned unchanged.</returns>
/// <remarks>
/// NOTE(review): the entity literals had been reduced to the characters they stand for,
/// which made most replacements no-ops; they are restored here.
/// </remarks>
public static String ToTxt(String str)
{
    if (str == null || str.Equals(""))
    {
        return str;
    }
    StringBuilder sb = new StringBuilder(str);
    sb.Replace("&nbsp;", " ");
    sb.Replace("<br>", "\r\n");
    sb.Replace("&lt;", "<");
    sb.Replace("&gt;", ">");
    // The ampersand is decoded last so an escaped entity is not decoded twice.
    sb.Replace("&amp;", "&");
    return sb.ToString();
}
#endregion
#region HTML特殊字符转换
/// <summary>
/// Replaces HTML-significant characters (angle brackets, space, double and single quote)
/// with entities and CRLF with a br tag followed by a space.
/// </summary>
/// <param name="theString">Text to encode.</param>
/// <returns>The encoded text; null or empty input is returned unchanged.</returns>
/// <remarks>
/// The ampersand itself is not escaped by this method.
/// NOTE(review): the entity literals had been reduced to the characters they stand for
/// (the double-quote replacement did not compile); they are restored here.
/// </remarks>
public static string HtmlEncode(string theString)
{
    if (string.IsNullOrEmpty(theString))
    {
        return theString;
    }
    theString = theString.Replace(">", "&gt;");
    theString = theString.Replace("<", "&lt;");
    theString = theString.Replace(" ", "&nbsp;");
    theString = theString.Replace("\"", "&quot;");
    theString = theString.Replace("'", "&#39;");
    // Done last so the tag's brackets and trailing space are not themselves encoded.
    theString = theString.Replace("\r\n", "<br/> ");
    return theString;
}
/// <summary>
/// Reverses the entity encoding: turns the br-plus-space marker back into CRLF and decodes
/// the angle-bracket, space, quote and em-dash entities.
/// </summary>
/// <param name="theString">Text to decode.</param>
/// <returns>The decoded text; null or empty input is returned unchanged.</returns>
/// <remarks>
/// NOTE(review): the entity literals had been reduced to the characters they stand for
/// (the double-quote replacement did not compile); they are restored here.
/// </remarks>
public static string HtmlDecode(string theString)
{
    if (string.IsNullOrEmpty(theString))
    {
        return theString;
    }
    // The line-break marker is handled before any entity is decoded; otherwise an encoded
    // literal "<br/> " would be decoded and then mistaken for a line break.
    theString = theString.Replace("<br/> ", "\r\n");
    theString = theString.Replace("&gt;", ">");
    theString = theString.Replace("&lt;", "<");
    theString = theString.Replace("&nbsp;", " ");
    theString = theString.Replace("&quot;", "\"");
    theString = theString.Replace("&#39;", "'");
    // Em dash entity (added 2012-05-07 according to the original comment).
    theString = theString.Replace("&mdash;", "—");
    return theString;
}
#endregion
#region html中读取a标签的href值
/// <summary>
/// Extracts hyperlinks from HTML: maps each anchor's href value to the anchor's inner content.
/// </summary>
/// <param name="content">HTML source.</param>
/// <returns>
/// A dictionary keyed by href. When the same href occurs more than once, the content of
/// the first occurrence is kept.
/// </returns>
public static Dictionary<string, string> GetUrl(string content)
{
    Dictionary<string, string> links = new Dictionary<string, string>();
    string pattern = @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>";
    foreach (Match m in Regex.Matches(content, pattern))
    {
        string href = m.Groups["url"].Value;
        // Test for the key instead of catching the duplicate-key exception: the outcome
        // (first occurrence wins) is the same, without exceptions as control flow and
        // without writing to the console from library code.
        if (!links.ContainsKey(href))
        {
            links.Add(href, m.Groups["text"].Value);
        }
    }
    return links;
}
#endregion
#region html中获取图片
/// <summary>
/// Returns the src value of the first img tag in the HTML whose src is not empty.
/// </summary>
/// <param name="sHtmlText">HTML source.</param>
/// <returns>The first non-empty image address, or an empty string when there is none.</returns>
public static string getHtmlFirstImage(string sHtmlText)
{
    // Matches an img tag and captures the src value in the group "imgUrl".
    Regex imgTag = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    foreach (Match tag in imgTag.Matches(sHtmlText))
    {
        string address = tag.Groups["imgUrl"].Value;
        if (address != "")
        {
            return address;
        }
    }
    return "";
}
/// <summary>
/// Collects the src values of all img tags in the HTML into one string separated by "|".
/// </summary>
/// <param name="sHtmlText">HTML source.</param>
/// <returns>The image addresses joined with "|", or an empty string when no img tag matches.</returns>
public static string getHtmlImageList(string sHtmlText)
{
    // Matches an img tag and captures the src value in the group "imgUrl".
    Regex imgTag = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    StringBuilder joined = new StringBuilder();
    foreach (Match tag in imgTag.Matches(sHtmlText))
    {
        // A separator is written only once something has been collected already.
        if (joined.Length > 0)
        {
            joined.Append("|");
        }
        joined.Append(tag.Groups["imgUrl"].Value);
    }
    return joined.ToString();
}
/// <summary>
/// Collects the distinct src values found in the HTML, in order of first appearance.
/// </summary>
/// <param name="sHtmlText">HTML source.</param>
/// <returns>An ArrayList of src strings without duplicates.</returns>
public static ArrayList GetHtmlSrcUrlList(string sHtmlText)
{
    // Unlike the img-specific helpers, this pattern is not anchored to a tag name; it
    // captures the src value in the group "imgUrl".
    // NOTE(review): the pattern begins and ends with a literal space — confirm that is intended.
    Regex srcAttribute = new Regex(@" \b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]* ");
    ArrayList found = new ArrayList();
    foreach (Match hit in srcAttribute.Matches(sHtmlText))
    {
        string address = hit.Groups["imgUrl"].Value;
        if (found.Contains(address))
        {
            continue;
        }
        found.Add(address);
    }
    return found;
}
/// <summary>
/// Rewrites every img tag in the HTML: removes its style, width and height attributes
/// and appends <paramref name="styleStr"/> before a self-closing " /&gt;" terminator.
/// </summary>
/// <param name="sHtmlText">HTML source.</param>
/// <param name="styleStr">Attribute text inserted verbatim at the end of each img tag.</param>
/// <returns>The HTML with every matched img tag replaced by its rewritten form.</returns>
public static string ClearHtmlImageHW(string sHtmlText, string styleStr)
{
    Regex imgTag = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    // The matches are taken from the text as passed in; the replacements below build new strings.
    foreach (Match tagMatch in imgTag.Matches(sHtmlText))
    {
        string originalTag = tagMatch.ToString();
        string rewritten = originalTag;

        // The attribute-stripping patterns only run when one of the names occurs in the tag.
        bool mentionsSizing = originalTag.IndexOf("width") > -1
            || originalTag.IndexOf("height") > -1
            || originalTag.IndexOf("style") > -1;
        if (mentionsSizing)
        {
            rewritten = Regex.Replace(rewritten, @"(?<=<img[\s\S]*?)style=((['""])[^'""]*\2|\S+)(?=[^>]*>)", "", RegexOptions.IgnoreCase);
            rewritten = Regex.Replace(rewritten, @"(?<=<img[\s\S]*?)width=((['""])[^'""]*\2|\S+)(?=[^>]*>)", "", RegexOptions.IgnoreCase);
            rewritten = Regex.Replace(rewritten, @"(?<=<img[\s\S]*?)height=((['""])[^'""]*\2|\S+)(?=[^>]*>)", "", RegexOptions.IgnoreCase);
        }

        // Choose the terminator that is present, most specific first, remove it,
        // and close the tag again after the injected attribute text.
        string terminator;
        if (rewritten.IndexOf(" />") != -1)
        {
            terminator = " />";
        }
        else if (rewritten.IndexOf("/>") != -1)
        {
            terminator = "/>";
        }
        else
        {
            terminator = ">";
        }
        rewritten = rewritten.Replace(terminator, "") + " " + styleStr + " />";

        sHtmlText = sHtmlText.Replace(originalTag, rewritten);
    }
    return sHtmlText;
}
#endregion
#region 页面信息读取
/// <summary>
/// Returns the host name of an address, assuming "http://" when no http/https scheme prefix is present.
/// </summary>
/// <param name="url">Address with or without a leading "http://" or "https://".</param>
/// <returns>The host part, or an empty string when the input is null, empty or not a valid URI.</returns>
public static string Host(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }
    url = url.Trim();
    // Test the prefix rather than any occurrence: an address such as
    // "example.com/?r=http://x" has no scheme of its own and still needs one.
    bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        url = "http://" + url;
    }
    try
    {
        return new Uri(url).Host;
    }
    catch (UriFormatException)
    {
        return string.Empty;
    }
}
/// <summary>
/// Extracts the page title by applying a title-element pattern through <c>MatchHelper.MatchScalar</c>.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>Whatever <c>MatchHelper.MatchScalar</c> returns for the title pattern.</returns>
public static string Title(string html)
{
    // The pattern captures the text between the opening and closing title tags.
    return MatchHelper.MatchScalar(html, "<title>([^<]+)</title>");
}
/// <summary>
/// Resolves a host name through DNS and returns the first address of the result.
/// </summary>
/// <param name="host">Host name to resolve.</param>
/// <returns>The first resolved address as text, or an empty string when resolution fails for any reason.</returns>
public static string Ip(string host)
{
    string address;
    try
    {
        IPAddress[] resolved = Dns.GetHostEntry(host).AddressList;
        address = resolved[0].ToString();
    }
    catch (Exception)
    {
        // Any failure (lookup error, empty address list) maps to an empty result.
        address = string.Empty;
    }
    return address;
}
/// <summary>
/// Parses the query-string parameters of an address into a name/value collection.
/// </summary>
/// <param name="url">A full URL, or just the query string with or without a leading '?'.</param>
/// <returns>The parsed parameters, or null when the input is null or parsing fails.</returns>
public static NameValueCollection UrlParseQuery(string url)
{
    if (url == null)
    {
        return null;
    }
    try
    {
        string query = url;
        int questionMark = url.IndexOf('?');
        int firstEquals = url.IndexOf('=');
        // ParseQueryString expects only the query part. When a '?' comes before the first
        // '=', everything up to it is the address and would otherwise end up inside the
        // first key. A '?' after the first '=' is treated as part of a value and left alone.
        if (questionMark >= 0 && (firstEquals < 0 || questionMark < firstEquals))
        {
            query = url.Substring(questionMark + 1);
            int fragment = query.IndexOf('#');
            if (fragment >= 0)
            {
                query = query.Substring(0, fragment);
            }
        }
        return HttpUtility.ParseQueryString(query);
    }
    catch (Exception)
    {
        return null;
    }
}
/// <summary>
/// URL-decodes a string with <c>HttpUtility.UrlDecode</c>.
/// </summary>
/// <param name="url">Encoded text.</param>
/// <returns>The decoded text, or the input unchanged if decoding throws.</returns>
public static string UrlDecode(string url)
{
    string decoded;
    try
    {
        decoded = HttpUtility.UrlDecode(url);
    }
    catch (Exception)
    {
        decoded = url;
    }
    return decoded;
}
/// <summary>
/// URL-encodes a string with <c>HttpUtility.UrlEncode</c>.
/// </summary>
/// <param name="url">Text to encode.</param>
/// <returns>The encoded text, or the input unchanged if encoding throws.</returns>
public static string UrlEncode(string url)
{
    string encoded;
    try
    {
        encoded = HttpUtility.UrlEncode(url);
    }
    catch (Exception)
    {
        encoded = url;
    }
    return encoded;
}
#endregion
#region 获得用户IP
/// <summary>
/// Returns the IPv4 address of the client of the current web request.
/// </summary>
/// <returns>
/// The first entry of the X-Forwarded-For server variable when it is set, otherwise
/// REMOTE_ADDR. The placeholder "1.1.1.1" is returned when there is no current request
/// or the candidate is not a dotted quad of four numbers in the range 0-255.
/// </returns>
/// <remarks>
/// X-Forwarded-For is supplied by the client or by intermediate proxies and can be forged;
/// do not rely on this value for security decisions.
/// </remarks>
public static string GetUserIp()
{
    const string placeholder = "1.1.1.1";

    System.Web.HttpContext context = System.Web.HttpContext.Current;
    if (context == null)
    {
        // Called outside of a request (background thread, test): nothing to read.
        return placeholder;
    }

    string candidate = context.Request.ServerVariables["HTTP_X_ForWARDED_For"];
    if (string.IsNullOrEmpty(candidate))
    {
        candidate = context.Request.ServerVariables["REMOTE_ADDR"];
    }
    else
    {
        // The header may list several hops ("client, proxy1, proxy2"); the first entry
        // is the originating client.
        int comma = candidate.IndexOf(',');
        if (comma >= 0)
        {
            candidate = candidate.Substring(0, comma);
        }
    }
    if (candidate == null)
    {
        return placeholder;
    }
    candidate = candidate.Trim();

    // Accept only four dot-separated decimal numbers of one to three digits, each 0-255.
    string[] octets = candidate.Split('.');
    bool valid = octets.Length == 4;
    for (int i = 0; valid && i < octets.Length; i++)
    {
        byte ignored;
        valid = octets[i].Length >= 1
            && octets[i].Length <= 3
            && byte.TryParse(
                octets[i],
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out ignored);
    }
    return valid ? candidate : placeholder;
}
#endregion
#region 通过网络获取IP
// Page that the ProxyIP property downloads and scans for proxy addresses.
private string url = "http://www.proxy360.cn/default.aspx";
// Alternative proxy-list page; no code visible in this class reads it.
private string url1 = "http://www.kuaidaili.com/";
/// <summary>
/// Downloads the proxy-list page and returns the entries extracted from it.
/// </summary>
/// <remarks>Each read of this property performs a network request.</remarks>
public List<string> ProxyIP
{
    get
    {
        string pageSource = HtmlHelper.GetHtmlCode(url);
        return ProcessHtml(pageSource);
    }
}
/// <summary>
/// Extracts "address:port" entries from the proxy-list page markup.
/// </summary>
/// <param name="html">Source of the proxy-list page.</param>
/// <returns>
/// One "address:port" string per row returned by <c>MatchHelper.MatchDt</c> (empty when
/// nothing matched), or null if an exception occurs.
/// </returns>
private List<string> ProcessHtml(string html)
{
    try
    {
        List<string> proxies = new List<string>();
        // The dots between the number groups are escaped so that they match a literal '.'
        // instead of any character.
        string regIP = "(\\d+\\.\\d+\\.\\d+\\.\\d+)\\s*</span>\\s*<span\\s*class=\"tbBottomLine\"\\s*style=\"width:50px;\">\\s*(\\d+)";
        // Pattern noted by the original author for the alternative page (url1):
        //   <td>(\d+.\d+.\d+.\d+)</td>\s*<td>(\d+)</td>
        DataTable dt = MatchHelper.MatchDt(html, regIP);
        if (dt != null)
        {
            foreach (DataRow row in dt.Rows)
            {
                // Column 0 holds the address group, column 1 the port group.
                proxies.Add(row[0].ToString() + ":" + row[1].ToString());
            }
        }
        return proxies;
    }
    catch (Exception)
    {
        return null;
    }
}
#endregion
#region 获取页面里面的链接信息
/// <summary>
/// Collects information about every img tag in the HTML whose address passes <c>FilterUrl</c>.
/// </summary>
/// <param name="html">HTML source.</param>
/// <param name="host">Host name of the page the HTML came from; handed to <c>FilterSrcUrl</c>.</param>
/// <returns>
/// One entry per accepted img tag (an empty list when no tag is found), or null if any
/// exception occurs while processing.
/// </returns>
public static List<ItemImg> ItemImg(string html, string host = "")
{
    try
    {
        List<ItemImg> images = new List<ItemImg>();
        List<string> tags = MatchHelper.MatchLists(html, "(<img\\s*[^>]*\\s*>)");
        if (tags == null || tags.Count == 0)
        {
            return images;
        }

        // Address is read from src or data-original; alternative text from alt.
        string srcReg = "src=\"(\\S+)\"|src=\'(\\S+)\'|data-original=\"(\\S+)\"|data-original='(\\S+)'";
        string altReg = "alt=\"(\\S+)\"|alt=\'(\\S+)\'";
        foreach (string tag in tags)
        {
            string address = MatchHelper.MatchScalar(tag, srcReg);
            if (!FilterUrl(address))
            {
                continue;
            }

            ItemImg entry = new ItemImg();
            string altText = MatchHelper.MatchScalar(tag, altReg);
            address = FilterSrcUrl(address, host);
            // The host is filled in only when the address mentions an http/https scheme.
            bool mentionsHttp = address.ToLower().Contains("http://") || address.ToLower().Contains("https://");
            if (mentionsHttp)
            {
                entry.ImgHost = new Uri(address).Host;
            }
            entry.ImgSrc = address;
            entry.ImgAlt = altText;
            entry.ImgLable = tag;
            images.Add(entry);
        }
        return images;
    }
    catch (Exception)
    {
        return null;
    }
}
/// <summary>
/// Collects information about every anchor tag in the HTML whose href passes <c>FilterUrl</c>.
/// </summary>
/// <param name="html">HTML source.</param>
/// <param name="host">Host name of the page the HTML came from; handed to <c>FilterSrcUrl</c>.</param>
/// <returns>
/// One entry per accepted anchor (an empty list when no anchor is found), or null if any
/// exception occurs while processing.
/// </returns>
internal static List<ItemA> ItemA(string html, string host = "")
{
    try
    {
        List<ItemA> anchors = new List<ItemA>();
        List<string> tags = MatchHelper.MatchLists(html, "(<a\\s*[^<]*\\s*>\\s*[^<]*\\s*<\\s*/\\s*a\\s*>)");
        if (tags == null || tags.Count == 0)
        {
            return anchors;
        }

        string hrefReg = "href=\"(\\S+)\"|href='(\\S+)'";
        string titleReg = "title=\"(\\S+)\"|title=\'(\\S+)\'";
        string contentReg = ">([^<]+)<";
        foreach (string tag in tags)
        {
            // Quote characters are removed from the extracted href before it is examined.
            string link = MatchHelper.MatchScalar(tag, hrefReg).Replace("\"", "").Replace("'", "");
            if (!FilterUrl(link))
            {
                continue;
            }

            ItemA entry = new ItemA();
            string titleText = MatchHelper.MatchScalar(tag, titleReg);
            string innerText = MatchHelper.MatchScalar(tag, contentReg);
            link = FilterSrcUrl(link, host);
            // The host is filled in only when the link mentions an http/https scheme.
            bool mentionsHttp = link.ToLower().Contains("http://") || link.ToLower().Contains("https://");
            if (mentionsHttp)
            {
                entry.AHost = new Uri(link).Host;
            }
            entry.Ahref = link;
            entry.ATitle = titleText;
            entry.AContent = innerText;
            entry.ALable = tag;
            anchors.Add(entry);
        }
        return anchors;
    }
    catch (Exception)
    {
        return null;
    }
}
/// <summary>
/// Decides whether an extracted address is worth keeping.
/// </summary>
/// <param name="url">Address to examine.</param>
/// <returns>
/// False for null or empty input, for "javascript:" pseudo-links (in any letter case)
/// and for fragment-only links starting with '#'; true otherwise.
/// </returns>
static bool FilterUrl(string url)
{
    // A failed extraction may hand in null; treat it like an empty address instead of throwing.
    if (string.IsNullOrEmpty(url))
    {
        return false;
    }
    // Ordinal comparison: URL schemes are not linguistic text, and they are case-insensitive.
    if (url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }
    if (url.StartsWith("#", StringComparison.Ordinal))
    {
        return false;
    }
    return true;
}
/// <summary>
/// Turns a relative address into an absolute http address by prefixing the page's host.
/// </summary>
/// <param name="srcUrl">Address taken from the page.</param>
/// <param name="host">Host of the page, with or without an http/https scheme and trailing slash.</param>
/// <returns>
/// The address unchanged when it already starts with http:// or https://, or when no host
/// is available; "http:" plus the address for protocol-relative addresses ("//host/path");
/// otherwise the host and the address joined by exactly one slash, with "http://" prefixed
/// unless the host already carries a scheme.
/// </returns>
static string FilterSrcUrl(string srcUrl, string host)
{
    if (string.IsNullOrEmpty(srcUrl))
    {
        return srcUrl;
    }
    // Prefix test, not substring test: "/go?u=http://x" is still a relative address.
    if (srcUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || srcUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return srcUrl;
    }
    // Protocol-relative addresses already name their own host.
    if (srcUrl.StartsWith("//", StringComparison.Ordinal))
    {
        return "http:" + srcUrl;
    }
    // Without a host there is nothing to resolve against. Building "http:///path" would
    // only produce an address that the Uri constructor rejects.
    if (string.IsNullOrEmpty(host))
    {
        return srcUrl;
    }
    bool hostHasScheme = host.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || host.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    string joined = host.TrimEnd('/') + "/" + srcUrl.TrimStart('/');
    return hostHasScheme ? joined : "http://" + joined;
}
/// <summary>
/// Finds quoted image addresses with the given extension anywhere in the HTML (not only
/// inside img tags).
/// </summary>
/// <param name="html">HTML source.</param>
/// <param name="host">
/// Base address of the page, including "http://" or "https://". When supplied, it is
/// prefixed to addresses that carry no scheme of their own.
/// </param>
/// <param name="type">
/// File extension to look for, for example "jpg", "png", "bmp" or "gif". It is inserted
/// into the pattern as-is, so letter case matters.
/// </param>
/// <returns>The list produced by <c>MatchHelper.MatchLists</c>, or null if an exception occurs.</returns>
public static List<string> ListImg(string html, string host = "", string type = "jpg")
{
    try
    {
        // The dot before the extension is escaped so that it matches a literal '.'
        // rather than any character.
        string picReg = "[\"|']([-a-zA-Z0-9@:%_\\+.~#?&//=]+\\." + type + ")[\"|']";
        List<string> picList = MatchHelper.MatchLists(html, picReg);

        bool hostIsAbsolute = !string.IsNullOrEmpty(host)
            && (host.Contains("http://") || host.Contains("https://"));
        if (picList != null && hostIsAbsolute)
        {
            for (int i = 0; i < picList.Count; i++)
            {
                string pic = picList[i];
                if (pic.Contains("http://") || pic.Contains("https://"))
                {
                    continue;
                }
                if (pic.StartsWith("//", StringComparison.Ordinal))
                {
                    // Protocol-relative addresses already name their own host.
                    continue;
                }
                // Join with exactly one slash, whichever side supplied it.
                picList[i] = host.TrimEnd('/') + "/" + pic.TrimStart('/');
            }
        }
        return picList;
    }
    catch (Exception)
    {
        return null;
    }
}
#endregion
#region 页面源代码读取
/// <summary>
/// Reads a page with an HTTP GET request, decoding the body as UTF-8 (best effort).
/// </summary>
/// <param name="strUrl">Address to request.</param>
/// <returns>
/// The response body; null when the body is empty; an empty string when any exception occurs.
/// </returns>
/// <remarks>The request timeout is set to 2000 milliseconds.</remarks>
public static string GetModel(string strUrl)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
        request.Timeout = 2000;
        // Dispose response, stream and reader on every path, including exceptions.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (System.IO.Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, System.Text.Encoding.UTF8))
        {
            string text = reader.ReadToEnd();
            // Kept for compatibility: an empty body has always been reported as null
            // here, while failures are reported as "".
            return text.Length == 0 ? null : text;
        }
    }
    catch (Exception)
    {
        return "";
    }
}
/// <summary>
/// Sends form data with an HTTP POST request and returns the response body (best effort).
/// </summary>
/// <param name="urlString">Address to post to.</param>
/// <param name="encoding">Encoding used both for the posted data and for decoding the response.</param>
/// <param name="postDataString">Form data, already in application/x-www-form-urlencoded form.</param>
/// <returns>
/// The response body, or an empty string when the request cannot be created, sent or read.
/// </returns>
/// <remarks>
/// A null <paramref name="encoding"/> or <paramref name="postDataString"/> still raises an
/// exception, because the data is encoded before any failure is tolerated.
/// </remarks>
public static string GetHtmlFromPost(string urlString, Encoding encoding, string postDataString)
{
    byte[] postData = encoding.GetBytes(postDataString);

    HttpWebRequest request;
    try
    {
        request = WebRequest.Create(urlString) as HttpWebRequest;
    }
    catch (Exception)
    {
        return string.Empty;
    }
    if (request == null)
    {
        // The address does not use the http/https scheme.
        return string.Empty;
    }

    request.Method = "POST";
    request.KeepAlive = false;
    request.ContentType = "application/x-www-form-urlencoded";
    request.CookieContainer = new CookieContainer();
    request.ContentLength = postData.Length;

    try
    {
        // Send the form data; the request stream is closed on every path.
        using (Stream upload = request.GetRequestStream())
        {
            upload.Write(postData, 0, postData.Length);
        }

        // Read the answer; response, stream and reader are closed on every path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best effort by design: a failed exchange yields an empty result.
        return string.Empty;
    }
}
/// <summary>
/// Reads a page with an HTTP GET request, sending a browser-like User-Agent (best effort).
/// </summary>
/// <param name="urlString">Address to request.</param>
/// <param name="encoding">Encoding used to decode the response.</param>
/// <returns>
/// The response body, or an empty string when the request cannot be created, sent or read.
/// </returns>
public static string GetHtmlFromGet(string urlString, Encoding encoding)
{
    HttpWebRequest request;
    try
    {
        request = WebRequest.Create(urlString) as HttpWebRequest;
    }
    catch (Exception)
    {
        return string.Empty;
    }
    if (request == null)
    {
        // The address does not use the http/https scheme.
        return string.Empty;
    }
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; Maxthon 2.0)";

    try
    {
        // Response, stream and reader are closed on every path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best effort by design: a failed exchange yields an empty result rather than
        // a null-reference failure further down.
        return string.Empty;
    }
}
#endregion
#region 从QueryString截取参数
/// <summary>
/// Returns the URL-decoded value of one parameter from a query string.
/// </summary>
/// <param name="s">Query string; '?' characters and the escape "%26" are treated as '&amp;' separators.</param>
/// <param name="para">Name of the parameter to find; the comparison is case-sensitive.</param>
/// <returns>
/// The decoded value of the first matching parameter, or "" when it is absent.
/// A null or empty <paramref name="s"/> is returned as-is.
/// </returns>
public static string QueryString(string s, string para)
{
    if (string.IsNullOrEmpty(s))
    {
        return s;
    }
    string normalized = s.Trim('?').Replace("%26", "&").Replace('?', '&');
    foreach (string pair in normalized.Split('&'))
    {
        // The name ends at the first '='; segments without '=' carry no value and are skipped.
        int equalsAt = pair.IndexOf('=');
        if (equalsAt < 0)
        {
            continue;
        }
        if (pair.Substring(0, equalsAt) == para)
        {
            return System.Web.HttpUtility.UrlDecode(pair.Substring(equalsAt + 1));
        }
    }
    return "";
}
#endregion
#region 模拟页面请求地址(可以使用在上传文件上)
/// <summary>
/// Sends a synchronous multipart/form-data POST request containing form fields and,
/// optionally, files, and returns the response body.
/// </summary>
/// <param name="url">Address to post to (http or https).</param>
/// <param name="queryString">Form fields as "name=value" pairs separated by '&amp;'.</param>
/// <param name="files">
/// Files to upload: <c>Name</c> is the form field name and <c>Value</c> the path of the
/// file on disk. May be null or empty.
/// </param>
/// <returns>The response body as returned by <c>WebResponseGet</c>.</returns>
/// <exception cref="ArgumentException">The address does not use the http/https scheme.</exception>
/// <remarks>File-access and network exceptions propagate to the caller with their original stack trace.</remarks>
public static string HttpPostWithFile(string url, string queryString, List<QueryParameter> files)
{
    string boundary = DateTime.Now.Ticks.ToString("x");
    HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
    if (webRequest == null)
    {
        throw new ArgumentException("The address must use the http or https scheme.", "url");
    }
    webRequest.ServicePoint.Expect100Continue = false;
    webRequest.Timeout = 20000;
    webRequest.ContentType = "multipart/form-data;charset=utf-8;boundary=" + boundary;
    webRequest.Method = "POST";
    webRequest.KeepAlive = false;
    webRequest.Credentials = CredentialCache.DefaultCredentials;

    byte[] beginBoundary = Encoding.UTF8.GetBytes("\r\n--" + boundary + "\r\n");
    byte[] endBoundary = Encoding.UTF8.GetBytes("\r\n--" + boundary + "--\r\n");
    string formdataTemplate = "Content-Disposition: form-data; name=\"{0}\"\r\n\r\n{1}";
    // The media type is written bare: a type/subtype is a token, not a quoted string.
    string headerTemplate = "Content-Disposition: form-data; name=\"{0}\"; filename=\"{1}\"\r\nContent-Type: {2}\r\n\r\n";

    // The body is assembled in memory first so that Content-Length is known up front.
    using (MemoryStream body = new MemoryStream())
    {
        // One part per form field.
        foreach (QueryParameter param in GetQueryParameters(queryString))
        {
            body.Write(beginBoundary, 0, beginBoundary.Length);
            byte[] fieldBytes = Encoding.UTF8.GetBytes(string.Format(formdataTemplate, param.Name, param.Value));
            body.Write(fieldBytes, 0, fieldBytes.Length);
        }

        // One part per file.
        if (files != null)
        {
            foreach (QueryParameter param in files)
            {
                string filePath = param.Value;
                string fileName = Path.GetFileName(filePath);
                string header = string.Format(headerTemplate, param.Name, fileName, GetContentType(fileName));
                byte[] headerBytes = Encoding.UTF8.GetBytes(header);

                body.Write(beginBoundary, 0, beginBoundary.Length);
                body.Write(headerBytes, 0, headerBytes.Length);
                using (FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
                {
                    fileStream.CopyTo(body);
                }
            }
        }

        // The closing boundary is written exactly once, after the last part. Writing it
        // after every file ended the body at the first file, and a request without files
        // was never terminated at all.
        body.Write(endBoundary, 0, endBoundary.Length);

        webRequest.ContentLength = body.Length;
        using (Stream requestStream = webRequest.GetRequestStream())
        {
            body.WriteTo(requestStream);
        }
    }

    return WebResponseGet(webRequest);
}
/// <summary>
/// Executes a prepared request and returns the response body as text.
/// </summary>
/// <param name="webRequest">Fully configured request.</param>
/// <returns>The response body, decoded with the <c>StreamReader</c> default encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="webRequest"/> is null.</exception>
/// <remarks>Network exceptions propagate to the caller with their original stack trace.</remarks>
public static string WebResponseGet(HttpWebRequest webRequest)
{
    if (webRequest == null)
    {
        throw new ArgumentNullException("webRequest");
    }
    // The response is obtained once and disposed together with its stream and reader,
    // including when reading fails.
    using (WebResponse response = webRequest.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Splits a query string into name/value parameters.
/// </summary>
/// <param name="strValue">"name=value" pairs separated by '&amp;'; leading or trailing spaces, '?' and '&amp;' are ignored.</param>
/// <returns>
/// One parameter per non-empty segment, in order. A segment without '=' becomes a
/// parameter with an empty value. Names and values are not URL-decoded.
/// </returns>
public static List<QueryParameter> GetQueryParameters(string strValue)
{
    List<QueryParameter> list = new List<QueryParameter>();
    if (string.IsNullOrEmpty(strValue))
    {
        return list;
    }
    foreach (string item in strValue.Trim(' ', '?', '&').Split('&'))
    {
        // Skip empty segments ("a=1&&b=2", or input that was nothing but separators).
        if (item.Length == 0)
        {
            continue;
        }
        // Split at the first '=' only, so values that contain '=' themselves
        // (base64 padding, nested queries) are kept whole.
        int equalsAt = item.IndexOf('=');
        if (equalsAt > -1)
        {
            list.Add(new QueryParameter(item.Substring(0, equalsAt), item.Substring(equalsAt + 1)));
        }
        else
        {
            list.Add(new QueryParameter(item, string.Empty));
        }
    }
    return list;
}
/// <summary>
/// Joins parameters into a "name=value&amp;name=value" string.
/// </summary>
/// <param name="paras">Parameters to join.</param>
/// <returns>The joined string, or "" when the list is null or empty. Nothing is URL-encoded.</returns>
public static string GetQueryFromParas(List<QueryParameter> paras)
{
    if (paras == null || paras.Count == 0)
    {
        return "";
    }
    StringBuilder query = new StringBuilder();
    for (int index = 0; index < paras.Count; index++)
    {
        // The separator goes between pairs, never after the last one.
        if (index > 0)
        {
            query.Append("&");
        }
        query.AppendFormat("{0}={1}", paras[index].Name, paras[index].Value);
    }
    return query.ToString();
}
/// <summary>
/// Looks up the MIME type registered for a file's extension in the Windows registry.
/// </summary>
/// <param name="fileName">File name or path; only its extension is used.</param>
/// <returns>
/// The "Content Type" value registered for the extension under HKEY_CLASSES_ROOT, or
/// "application/octet-stream" when the name is null, has no extension, or nothing is registered.
/// </returns>
public static string GetContentType(string fileName)
{
    // The generic binary type is spelled with a hyphen.
    const string fallback = "application/octet-stream";

    if (string.IsNullOrEmpty(fileName))
    {
        return fallback;
    }
    string ext = Path.GetExtension(fileName);
    if (string.IsNullOrEmpty(ext))
    {
        // Without an extension there is no registry key to look up.
        return fallback;
    }

    // Registry keys hold a native handle; release it when done.
    using (RegistryKey registryKey = Registry.ClassesRoot.OpenSubKey(ext.ToLowerInvariant()))
    {
        if (registryKey != null)
        {
            object registered = registryKey.GetValue("Content Type");
            if (registered != null)
            {
                return registered.ToString();
            }
        }
    }
    return fallback;
}
/// <summary>
/// Reorders a timestamp of the form "Wed Nov 17 15:07:48 +0800 2010" into
/// "Wed Nov 17 2010 15:07:48". The fields are only rearranged; the offset field is
/// dropped and no time-zone conversion takes place.
/// </summary>
/// <param name="strValue">Timestamp with six space-separated fields: weekday, month, day, time, offset, year.</param>
/// <returns>
/// The reordered timestamp; "" for null or empty input; the input unchanged when it does
/// not contain at least six fields.
/// </returns>
public static string UtcToDateTime(string strValue)
{
    if (string.IsNullOrEmpty(strValue))
    {
        return "";
    }
    // Empty entries are removed so that padded days ("Nov  7") do not shift the fields.
    string[] fields = strValue.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    if (fields.Length < 6)
    {
        // Not in the expected shape: hand the text back rather than fail on a missing field.
        return strValue;
    }
    return fields[0] + " " + fields[1] + " " + fields[2] + " " + fields[5] + " " + fields[3];
}
#endregion
}
/// <summary>
/// Plain data holder describing one anchor (a) tag extracted from a page.
/// </summary>
public class ItemA
{
/// <summary>
/// The link address (href) of the anchor.
/// </summary>
public string Ahref { get; set; }
/// <summary>
/// The title attribute of the anchor.
/// </summary>
public string ATitle { get; set; }
/// <summary>
/// The content of the anchor.
/// </summary>
public string AContent { get; set; }
/// <summary>
/// The host name of the anchor's link address.
/// </summary>
public string AHost { get; set; }
/// <summary>
/// The anchor tag itself.
/// </summary>
public string ALable { get; set; }
}
/// <summary>
/// Plain data holder describing one img tag extracted from a page.
/// </summary>
public class ItemImg
{
/// <summary>
/// The image address of the img tag.
/// </summary>
public string ImgSrc { get; set; }
/// <summary>
/// The alternative text (alt) of the img tag.
/// </summary>
public string ImgAlt { get; set; }
/// <summary>
/// The host name of the image address.
/// </summary>
public string ImgHost { get; set; }
/// <summary>
/// The img tag itself.
/// </summary>
public string ImgLable { get; set; }
}
/// <summary>
/// An immutable name/value pair used for query-string and form parameters.
/// </summary>
public class QueryParameter
{
    private readonly string name = string.Empty;
    private readonly string value = string.Empty;

    /// <summary>
    /// Creates a parameter from a name and a string value.
    /// </summary>
    /// <param name="name">Parameter name; null is reported as an empty string by <see cref="Name"/>.</param>
    /// <param name="value">Parameter value; null is reported as an empty string by <see cref="Value"/>.</param>
    public QueryParameter(string name, string value)
    {
        this.name = name;
        this.value = value;
    }

    /// <summary>
    /// Creates a parameter from a name and an arbitrary value, stored as its <c>ToString()</c> text.
    /// </summary>
    /// <param name="name">Parameter name.</param>
    /// <param name="value">Parameter value; null is accepted and reported as an empty string.</param>
    public QueryParameter(string name, object value)
    {
        this.name = name;
        // A null object must not throw here; the Value getter already maps null to "".
        this.value = value == null ? null : value.ToString();
    }

    /// <summary>
    /// The parameter name with surrounding whitespace removed; never null.
    /// </summary>
    public string Name
    {
        get { return name == null ? string.Empty : name.Trim(); }
    }

    /// <summary>
    /// The parameter value with surrounding whitespace removed; never null.
    /// </summary>
    public string Value
    {
        get { return value == null ? string.Empty : value.Trim(); }
    }
}