CollectionHelper-网页采集辅助类代码,CSharp代码库,德仔网

代码语言
代码分类
【CSharp】 CollectionHelper-网页采集辅助类
作者:汉妤 / 发布于2016/1/4/ 1625

	using System;
	using System.Collections.Generic;
	using System.Linq;
	using System.Text;
	using System.Text.RegularExpressions;
	using System.IO;
	using System.Net;
	
	namespace Framework
	{
	 /// <summary>
	 /// Helper methods for web-page scraping: downloading page source and pulling
	 /// elements, links and image addresses out of HTML text.
	 /// </summary>
	 /// <remarks>
	 /// All extraction methods are regular-expression heuristics, not an HTML parser.
	 /// They work on reasonably well-formed markup and are case-insensitive with
	 /// respect to tag and attribute names.
	 /// </remarks>
	 public static class CollectionHelper
	 {
	     // Fixed patterns are built once and shared; Regex instances are thread-safe for matching.
	
	     // Anchor with a quoted href. The look-ahead keeps "<a" from matching "<abbr>", "<area>", etc.
	     private static readonly Regex AnchorRegex = new Regex(
	         @"<a(?=\s)[^>]*?\shref\s*=\s*[""'](?<url>[^""']*?)[""'][^>]*>(?<text>[\s\S]*?)</a\s*>",
	         RegexOptions.Compiled | RegexOptions.IgnoreCase);
	
	     // URL-like token ending in a known page extension, with an optional http/https scheme.
	     private static readonly Regex PageUrlRegex = new Regex(
	         @"(https?://)?[\w-\.]*([\/]?[\w-])+[\w-]*\.(htm|html|shtm|shtml|aspx|asp|php|jsp)+[\w-\=\?]*",
	         RegexOptions.Compiled | RegexOptions.IgnoreCase);
	
	     // <img> tag with a double-quoted, single-quoted or unquoted src attribute.
	     // "[^>]*?" keeps the search for src inside the <img> tag itself.
	     private static readonly Regex ImageRegex = new Regex(
	         @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<src>[^""]*)""|'(?<src>[^']*)'|(?<src>[^>\s]*))[^>]*>",
	         RegexOptions.Compiled | RegexOptions.IgnoreCase);
	
	     // Any opening or closing HTML tag.
	     private static readonly Regex HtmlTagRegex = new Regex(
	         "</?[a-z][^<>]*>",
	         RegexOptions.Compiled | RegexOptions.IgnoreCase);
	
	     // A string that starts with "scheme://".
	     private static readonly Regex SchemePrefixRegex = new Regex(
	         @"^\s*[a-z][a-z0-9+.\-]*://",
	         RegexOptions.Compiled | RegexOptions.IgnoreCase);
	
	     /// <summary>
	     /// Returns every element with the given tag name found in the text
	     /// (lookup by tag name only, attributes are not inspected).
	     /// </summary>
	     /// <param name="orgStr">HTML text to search.</param>
	     /// <param name="domElem">Tag name, e.g. <c>div</c>. Treated as literal text, not as a regex.</param>
	     /// <returns>
	     /// The outer HTML of each match, in document order. Empty when either argument is
	     /// null/empty or nothing matches.
	     /// </returns>
	     /// <remarks>
	     /// Matching is lazy: for nested elements of the same name the match ends at the
	     /// first closing tag.
	     /// </remarks>
	     public static List<string> GetDomElem(string orgStr, string domElem)
	     {
	         List<string> matchList = new List<string>();
	         if (string.IsNullOrEmpty(orgStr) || string.IsNullOrEmpty(domElem))
	         {
	             return matchList;
	         }
	
	         string tag = Regex.Escape(domElem);
	         // The look-ahead requires the tag name to end here, so "a" does not match "<abbr>".
	         // "*?" (rather than "+?") lets empty elements such as <p></p> match on their own.
	         string pattern = "<" + tag + @"(?=[\s/>])[^>]*>[\s\S]*?</" + tag + @"\s*>";
	
	         foreach (Match match in Regex.Matches(orgStr, pattern, RegexOptions.IgnoreCase))
	         {
	             matchList.Add(match.Value);
	         }
	         return matchList;
	     }
	
	     /// <summary>
	     /// Returns every element that carries the given attribute with the given value,
	     /// e.g. <c>class="aa"</c>. Nested elements of the same tag are kept inside the match.
	     /// </summary>
	     /// <param name="orgStr">HTML text to search.</param>
	     /// <param name="tagName">Attribute name, e.g. <c>class</c>. Treated as literal text.</param>
	     /// <param name="tagValue">Attribute value to look for. Treated as literal text.</param>
	     /// <returns>
	     /// The outer HTML of each match, in document order. Empty when <paramref name="orgStr"/>
	     /// or <paramref name="tagName"/> is null/empty, <paramref name="tagValue"/> is null,
	     /// or nothing matches.
	     /// </returns>
	     public static List<string> GetDomElemByAttr(string orgStr, string tagName, string tagValue)
	     {
	         List<string> matchList = new List<string>();
	         if (string.IsNullOrEmpty(orgStr) || string.IsNullOrEmpty(tagName) || tagValue == null)
	         {
	             return matchList;
	         }
	
	         // Escape both inputs so characters such as '.', '+' or '(' in an attribute value
	         // are matched literally instead of being interpreted (or rejected) as regex syntax.
	         // The Nested balancing group tracks inner open/close tags of the same element name.
	         string pattern = string.Format(
	             @"<(?<HtmlTag>[\w]+)[^>]*\s{0}=(?<Quote>[""']?){1}(?(Quote)\k<Quote>)[""']?[^>]*>((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|[\s\S]*?)*</\k<HtmlTag>>",
	             Regex.Escape(tagName),
	             Regex.Escape(tagValue));
	
	         foreach (Match match in Regex.Matches(orgStr, pattern, RegexOptions.IgnoreCase))
	         {
	             matchList.Add(match.Value);
	         }
	         return matchList;
	     }
	
	     /// <summary>
	     /// Extracts the anchors of the text as [link text, url] pairs.
	     /// </summary>
	     /// <param name="orgStr">HTML text to search.</param>
	     /// <returns>
	     /// A dictionary keyed by the anchor text with all HTML tags removed; the value is the
	     /// content of the <c>href</c> attribute exactly as written. When several anchors share
	     /// the same text the first one wins. Empty when the input is null/empty.
	     /// </returns>
	     public static Dictionary<string, string> GetDomElem_A(string orgStr)
	     {
	         Dictionary<string, string> matchList = new Dictionary<string, string>();
	         if (string.IsNullOrEmpty(orgStr))
	         {
	             return matchList;
	         }
	
	         foreach (Match match in AnchorRegex.Matches(orgStr))
	         {
	             string key = match.Value.RemoveHtml();
	             if (!matchList.ContainsKey(key))
	             {
	                 // Use the captured href directly: it is always present for a match and
	                 // is not limited to URLs ending in a known page extension.
	                 matchList.Add(key, match.Groups["url"].Value);
	             }
	         }
	         return matchList;
	     }
	
	
	     /// <summary>
	     /// Downloads the source of a web page.
	     /// </summary>
	     /// <param name="url">Address of the page.</param>
	     /// <param name="coding">Name of the text encoding used to decode the response.</param>
	     /// <returns>The page source, or an error message (see the <see cref="Uri"/> overload).</returns>
	     /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
	     /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
	     public static string GetPageSourceByUrl(string url, string coding = "gb2312")
	     {
	         return GetPageSourceByUrl(new Uri(url), coding);
	     }
	
	     /// <summary>
	     /// Downloads the source of a web page.
	     /// </summary>
	     /// <param name="url">Address of the page (must be an http/https URI).</param>
	     /// <param name="coding">Name of the text encoding used to decode the response.</param>
	     /// <returns>
	     /// The page source. If the request fails with a <see cref="NotSupportedException"/>,
	     /// <see cref="InvalidOperationException"/> (which includes <see cref="WebException"/>)
	     /// or <see cref="IOException"/>, the exception message is returned instead.
	     /// </returns>
	     /// <exception cref="ArgumentException"><paramref name="coding"/> is not a known encoding name.</exception>
	     public static string GetPageSourceByUrl(Uri url, string coding = "gb2312")
	     {
	         try
	         {
	             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
	             // "using" releases the connection, stream and reader on every path,
	             // including when reading or decoding throws.
	             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
	             using (Stream stream = response.GetResponseStream())
	             using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding(coding)))
	             {
	                 return reader.ReadToEnd();
	             }
	         }
	         catch (NotSupportedException exception)
	         {
	             return exception.Message;
	         }
	         catch (InvalidOperationException exception)
	         {
	             return exception.Message;
	         }
	         catch (IOException exception)
	         {
	             return exception.Message;
	         }
	     }
	
	     /// <summary>
	     /// Extracts page URLs from the text: tokens that end in one of the extensions
	     /// htm, html, shtm, shtml, aspx, asp, php or jsp, optionally followed by a simple
	     /// query string and optionally preceded by <c>http://</c> or <c>https://</c>.
	     /// </summary>
	     /// <param name="code">Text (typically the HTML of a list page) to search.</param>
	     /// <returns>The matched URLs in order of appearance. Empty when the input is null/empty.</returns>
	     public static List<string> GetUrlArray(string code)
	     {
	         List<string> urlList = new List<string>();
	         if (string.IsNullOrEmpty(code))
	         {
	             return urlList;
	         }
	
	         foreach (Match match in PageUrlRegex.Matches(code))
	         {
	             urlList.Add(match.Value);
	         }
	         return urlList;
	     }
	
	     /// <summary>
	     /// Extracts the addresses of all images (<c>src</c> of <c>&lt;img&gt;</c> tags) in the content.
	     /// </summary>
	     /// <param name="content">HTML text to search.</param>
	     /// <returns>
	     /// A dictionary whose key and value are both the image address, in its original
	     /// letter case; duplicates are collapsed. Empty when the input is null/empty.
	     /// </returns>
	     public static Dictionary<string, string> GetImgUrlArray(string content)
	     {
	         Dictionary<string, string> imgList = new Dictionary<string, string>();
	         if (string.IsNullOrEmpty(content))
	         {
	             return imgList;
	         }
	
	         // Match case-insensitively instead of lower-casing the input, so that
	         // case-sensitive paths and query strings survive unchanged.
	         foreach (Match match in ImageRegex.Matches(content))
	         {
	             string src = match.Groups["src"].Value;
	             if (!imgList.ContainsKey(src))
	             {
	                 imgList.Add(src, src);
	             }
	         }
	         return imgList;
	     }
	
	     /// <summary>
	     /// Converts a relative address into an absolute one.
	     /// </summary>
	     /// <param name="relativeAddress">The address to convert.</param>
	     /// <param name="absoluteAddress">Address of the page the relative address was found on.</param>
	     /// <returns>
	     /// <paramref name="relativeAddress"/> unchanged when it already starts with a scheme
	     /// (<c>xxx://</c>); otherwise the address resolved against
	     /// <paramref name="absoluteAddress"/>. An empty string when either input is
	     /// null/empty, the base address has no scheme, or the addresses cannot be parsed.
	     /// </returns>
	     public static string ConvertToAbsluteUrl(string relativeAddress, string absoluteAddress)
	     {
	         if (string.IsNullOrEmpty(relativeAddress))
	         {
	             return string.Empty;
	         }
	         // Only a leading scheme makes the address absolute; "://" inside a query
	         // string (e.g. "go.html?u=http://x") must still be resolved against the base.
	         if (SchemePrefixRegex.IsMatch(relativeAddress))
	         {
	             return relativeAddress;
	         }
	         if (string.IsNullOrEmpty(absoluteAddress) || !SchemePrefixRegex.IsMatch(absoluteAddress))
	         {
	             return string.Empty;
	         }
	
	         Uri baseUrl;
	         Uri resolved;
	         if (!Uri.TryCreate(absoluteAddress, UriKind.Absolute, out baseUrl) ||
	             !Uri.TryCreate(baseUrl, relativeAddress, out resolved))
	         {
	             return string.Empty;
	         }
	         return resolved.ToString();
	     }
	
	     /// <summary>
	     /// Removes all HTML tags from the string.
	     /// </summary>
	     /// <param name="input">The string whose tags should be removed.</param>
	     /// <returns>The text without tags; an empty string when the input is null or empty.</returns>
	     public static string RemoveHtml(this string input)
	     {
	         if (string.IsNullOrEmpty(input))
	         {
	             return string.Empty;
	         }
	         return HtmlTagRegex.Replace(input, string.Empty);
	     }
	 }
	}
试试其它关键字
　CollectionHelper　　网页采集　
同语言下
可能有用的
汉妤贡献的其它代码(20)