`
380071587
  • 浏览: 478891 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

CollectionHelper-网页采集辅助类

 
阅读更多
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace Helper
{
    /// <summary>
    ///     Helper methods for web-page scraping: extracting elements, links and image
    ///     addresses from HTML text with regular expressions, resolving relative URLs,
    ///     stripping markup, and downloading a page with charset detection.
    /// </summary>
    /// <remarks>
    ///     All extraction is regex based and works on raw text; it is not an HTML parser.
    ///     The extraction and clean-up methods return an empty result for null or empty input
    ///     rather than throwing or placing an error message among the results.
    /// </remarks>
    public static class CollectionHelper
    {
        // Constant patterns are built once; constructing a Compiled regex per call is expensive.

        // Anchor element with a quoted href; "url" is the href value, "text" the inner markup.
        private static readonly Regex AnchorRegex = new Regex(
            @"<a\b[^>]*?\shref\s*=\s*[""'](?<url>[^""']*?)[""'][^>]*>(?<text>[\s\S]*?)</a\s*>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // URL-like token that ends in one of the listed page extensions, with an optional
        // http:// or https:// prefix and an optional simple query string.
        private static readonly Regex UrlRegex = new Regex(
            @"(https?://)?[\w-\.]*([\/]?[\w-])+[\w-]*\.(htm|html|shtm|shtml|aspx|asp|php|jsp)+[\w-\=\?]*",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // <img> tag whose src attribute is double-quoted, single-quoted or unquoted.
        // [^>]* keeps the search for src inside the img tag itself.
        private static readonly Regex ImgRegex = new Regex(
            @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<src>[^""]*)""|'(?<src>[^']*)'|(?<src>[^>\s]*))[^>]*>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Any opening or closing tag.
        private static readonly Regex TagRegex = new Regex(
            "</?[a-z][^<>]*>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // charset=xxx as found in a meta tag or Content-Type value; the value may be quoted.
        private static readonly Regex CharsetRegex = new Regex(
            @"\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // A URI scheme followed by "://" at the very start of the string.
        private static readonly Regex SchemePrefixRegex = new Regex(
            @"^[a-z][a-z0-9+.\-]*://",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        ///     Returns every <c>&lt;tag ...&gt;...&lt;/tag&gt;</c> element with the given tag name
        ///     found in <paramref name="source" />, regardless of its attributes.
        /// </summary>
        /// <param name="source">HTML text to search.</param>
        /// <param name="domElem">Tag name, e.g. <c>div</c>. Treated as literal text (regex-escaped).</param>
        /// <returns>
        ///     The full text of each matched element (case-insensitive tag match); an empty list
        ///     when either argument is null or empty.
        /// </returns>
        /// <remarks>
        ///     Matching is non-greedy, so an element that contains another element of the same
        ///     name ends at the first closing tag.
        /// </remarks>
        public static List<string> GetDomElem(string source, string domElem)
        {
            if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(domElem))
            {
                return new List<string>();
            }

            // (?:\s[^>]*)? requires the tag name to end right there, so "a" no longer matches
            // "<abbr>"; *? (instead of +?) lets an empty element match on its own instead of
            // swallowing the following sibling.
            string pattern = string.Format(@"<{0}(?:\s[^>]*)?>[\s\S]*?</{0}\s*>", Regex.Escape(domElem));
            return CollectMatches(new Regex(pattern, RegexOptions.IgnoreCase), source);
        }

        /// <summary>
        ///     Returns every element in <paramref name="source" /> that carries the given
        ///     attribute with the given value, e.g. <c>class="aa"</c>.
        /// </summary>
        /// <param name="source">HTML text to search.</param>
        /// <param name="tagName">Attribute name, e.g. <c>class</c>. Treated as literal text.</param>
        /// <param name="tagValue">Attribute value, e.g. <c>aa</c>. Treated as literal text.</param>
        /// <returns>
        ///     The full text of each matched element; an empty list when <paramref name="source" />
        ///     is null or empty or when <paramref name="tagName" /> or <paramref name="tagValue" /> is null.
        /// </returns>
        /// <remarks>
        ///     The pattern uses .NET balancing groups (<c>Nested</c>) in an attempt to pair nested
        ///     tags of the same name; it is left unchanged here apart from escaping the inputs.
        /// </remarks>
        public static List<string> GetDomElemByAttr(string source, string tagName, string tagValue)
        {
            if (string.IsNullOrEmpty(source) || tagName == null || tagValue == null)
            {
                return new List<string>();
            }

            // Inputs are escaped so characters such as '.' or '(' in a value match literally and
            // cannot produce an invalid pattern. No lower-casing is needed: IgnoreCase already
            // covers it, and the culture-sensitive ToLower() could alter the name.
            string pattern =
                string.Format(
                    @"<(?<HtmlTag>[\w]+)[^>]*\s{0}[\s]*?=[\s]*?(?<Quote>[""']?){1}(?(Quote)\k<Quote>)[""']?[^>]*>((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|[\s\S]*?)*</\k<HtmlTag>>",
                    Regex.Escape(tagName), Regex.Escape(tagValue));
            return CollectMatches(new Regex(pattern, RegexOptions.IgnoreCase), source);
        }

        /// <summary>
        ///     Extracts the anchor elements of <paramref name="source" /> as
        ///     <c>[link text, href]</c> pairs.
        /// </summary>
        /// <param name="source">HTML text to search.</param>
        /// <returns>
        ///     A dictionary keyed by the anchor's text with all tags removed; the value is the
        ///     href attribute exactly as written. When several anchors share the same text only
        ///     the first is kept. Empty when <paramref name="source" /> is null or empty.
        /// </returns>
        public static Dictionary<string, string> GetDomElem_A(string source)
        {
            var links = new Dictionary<string, string>();
            if (string.IsNullOrEmpty(source))
            {
                return links;
            }

            foreach (Match match in AnchorRegex.Matches(source))
            {
                // Use the captured href directly: re-scanning the anchor with GetUrlArray and
                // indexing [0] failed for any link without a known file extension.
                string text = RemoveHtml(match.Groups["text"].Value);
                if (!links.ContainsKey(text))
                {
                    links.Add(text, match.Groups["url"].Value);
                }
            }
            return links;
        }

        /// <summary>
        ///     Finds URL-like tokens in <paramref name="code" /> that end in a known page
        ///     extension (htm, html, shtm, shtml, aspx, asp, php, jsp).
        /// </summary>
        /// <param name="code">Text (typically list-page HTML) to scan.</param>
        /// <returns>
        ///     The matched URLs in order of appearance, including an <c>http://</c> or
        ///     <c>https://</c> prefix when present; an empty list for null or empty input.
        /// </returns>
        public static List<string> GetUrlArray(string code)
        {
            if (string.IsNullOrEmpty(code))
            {
                return new List<string>();
            }
            return CollectMatches(UrlRegex, code);
        }

        /// <summary>
        ///     Collects the <c>src</c> address of every <c>&lt;img&gt;</c> tag in
        ///     <paramref name="content" />.
        /// </summary>
        /// <param name="content">HTML text to search.</param>
        /// <returns>
        ///     A dictionary in which each distinct address is both key and value, with the
        ///     address's original letter case preserved; empty for null or empty input.
        /// </returns>
        public static Dictionary<string, string> GetImgUrlArray(string content)
        {
            var images = new Dictionary<string, string>();
            if (string.IsNullOrEmpty(content))
            {
                return images;
            }

            // Case-insensitivity comes from the regex options; lower-casing the content would
            // corrupt case-sensitive paths.
            foreach (Match match in ImgRegex.Matches(content))
            {
                string src = match.Groups["src"].Value;
                if (!images.ContainsKey(src))
                {
                    images.Add(src, src);
                }
            }
            return images;
        }

        /// <summary>
        ///     Resolves a relative address against the address of the page it was found on.
        /// </summary>
        /// <param name="relativeAddress">The address to convert.</param>
        /// <param name="absoluteAddress">Absolute address of the current page.</param>
        /// <returns>
        ///     <paramref name="relativeAddress" /> unchanged when it already starts with a scheme
        ///     (<c>xxx://</c>); otherwise the resolved absolute address. An empty string when
        ///     <paramref name="relativeAddress" /> is null or empty, when
        ///     <paramref name="absoluteAddress" /> is null, empty, lacks <c>://</c> or is not a valid
        ///     absolute URI, or when the two cannot be combined.
        /// </returns>
        public static string ConvertToAbsluteUrl(string relativeAddress, string absoluteAddress)
        {
            if (string.IsNullOrEmpty(relativeAddress))
            {
                return string.Empty;
            }

            // Only a leading scheme makes the address absolute; "://" inside a query string
            // (e.g. "/go?u=http://x") must not.
            if (SchemePrefixRegex.IsMatch(relativeAddress))
            {
                return relativeAddress;
            }
            if (string.IsNullOrEmpty(absoluteAddress) || !absoluteAddress.Contains("://"))
            {
                return string.Empty;
            }

            // TryCreate instead of the constructors so a malformed address yields the same
            // empty-string result as the other rejected inputs instead of a UriFormatException.
            Uri baseUri;
            if (!Uri.TryCreate(absoluteAddress, UriKind.Absolute, out baseUri))
            {
                return string.Empty;
            }
            Uri resolved;
            if (!Uri.TryCreate(baseUri, relativeAddress, out resolved))
            {
                return string.Empty;
            }
            return resolved.ToString();
        }

        /// <summary>
        ///     Removes every HTML tag from <paramref name="input" />, keeping the text between tags.
        /// </summary>
        /// <param name="input">The string to strip.</param>
        /// <returns>The text without tags; an empty string for null or empty input.</returns>
        public static string RemoveHtml(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return string.Empty;
            }
            return TagRegex.Replace(input, string.Empty);
        }

        /// <summary>
        ///     Removes space characters, carriage returns and line feeds from
        ///     <paramref name="input" />. Other whitespace (such as tabs) is kept.
        /// </summary>
        /// <param name="input">The string to clean.</param>
        /// <returns>The cleaned string; an empty string for null or empty input.</returns>
        public static string RemoveBlank(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return string.Empty;
            }
            return input.Replace("\r", string.Empty)
                        .Replace("\n", string.Empty)
                        .Replace(" ", string.Empty);
        }

        /// <summary>
        ///     Downloads the page at <paramref name="url" /> and decodes it using the charset
        ///     declared in the page, falling back to UTF-8.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        ///     The page text. If the download or decoding fails with a
        ///     <see cref="NotSupportedException" />, <see cref="InvalidOperationException" /> or
        ///     <see cref="IOException" />, the exception message is returned instead.
        /// </returns>
        public static string GetHtml(string url)
        {
            return GetHtml(url, null);
        }

        /// <summary>
        ///     Downloads the page at <paramref name="url" /> and decodes it with
        ///     <paramref name="encoding" />, or with the detected charset when it is null.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encoding">Encoding to use, or null to detect it from the page.</param>
        /// <returns>The page text, or the exception message for the failures caught below.</returns>
        private static string GetHtml(string url, Encoding encoding)
        {
            try
            {
                byte[] buf;
                // WebClient is IDisposable; release it as soon as the bytes are downloaded.
                using (var client = new WebClient())
                {
                    buf = client.DownloadData(url);
                }

                if (encoding != null)
                {
                    return encoding.GetString(buf);
                }

                // Decode as UTF-8 first just to be able to read the charset declaration,
                // then re-decode the raw bytes only if a different charset was declared.
                string html = Encoding.UTF8.GetString(buf);
                Encoding detected = GetEncoding(html);
                if (detected == null || detected.Equals(Encoding.UTF8))
                {
                    return html;
                }
                return detected.GetString(buf);
            }
            catch (NotSupportedException exception)
            {
                return exception.Message;
            }
            catch (InvalidOperationException exception)
            {
                return exception.Message;
            }
            catch (IOException exception)
            {
                return exception.Message;
            }
        }

        /// <summary>
        ///     Determines the encoding named by the first <c>charset=...</c> declaration in
        ///     <paramref name="html" />.
        /// </summary>
        /// <param name="html">Page text to inspect.</param>
        /// <returns>
        ///     The declared encoding, or null when no charset is declared or the declared name
        ///     is not an encoding known to the runtime.
        /// </returns>
        private static Encoding GetEncoding(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            Match match = CharsetRegex.Match(html);
            if (!match.Success)
            {
                // No declaration at all is the common case; do not route it through an exception.
                return null;
            }

            try
            {
                return Encoding.GetEncoding(match.Groups["charset"].Value);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }

        /// <summary>
        ///     Runs <paramref name="regex" /> over <paramref name="input" /> and returns the text
        ///     of every match in order.
        /// </summary>
        private static List<string> CollectMatches(Regex regex, string input)
        {
            var values = new List<string>();
            foreach (Match match in regex.Matches(input))
            {
                values.Add(match.Value);
            }
            return values;
        }
    }
}

分享到:
评论

相关推荐

    MongoDB驱动1.7版本帮助类包括案例方法

    在1.7版本中,帮助类可能包括`MongoClientHelper`、`CollectionHelper`等,它们提供了如连接数据库、获取集合、创建索引等功能。例如,`MongoClientHelper`可能有`connect()`方法来建立数据库连接,而`...

    JAVA 工具类 toolkit

    对常用的处理封装 JsonResult、PageBean、exception、excel、FtpHelper、HttpHelper、AESHelper、DESHelper、RSAHelper、ChineseUtil、ClassUtil、CollectionHelper、ConvertUtil、DateUtil、FileUtil、ImageUtil、...

    WPF应用程序框架(WAF)v2.5.0.7源码2012825

    PropertyChanged:提供一个辅助方法,用于测试在执行特定操作时是否触发了属性变更事件。 CanExecuteChangedEvent:一个辅助方法,用于测试在执行特定操作时是否触发了 CanExecuteChanged 事件。 v2507更新...

    Json、Webservice、Jquery、Ajax

    版本号:v1.0 创建人:王国胜 版本创建日期:2011-03-30... CollectionHelper.cs List与DataTable相互转换类 JsonAndDateTable.cs Json与DataTable相互转换类 Default.aspx Json、Webservice、Jquery、Ajax测试页

    Tup.Utilities:一组 C# 工具助手类(C # tools a set of helper classes)

    CollectionHelper 集合处理 工具类 DateTimeHelper 时间操作 工具类 BatchHelper 批量执行动作 工具类 FieldHelper DataTable/IDataReader 数据字段 工具类 RetryHelper 重试 工具类 JsonHelper Newtonsoft.Json ...

    基于Java的简单WebUI项目.zip

    - **CollectionHelper.java**: 提供了一系列静态方法,用于对集合(数组或列表)进行常见的操作,如分组、计数、筛选和映射。 ### 2. 数据库连接与操作 - **Info.java**: 存储数据库连接的相关信息,如地址、...

    SparkGraphx计算指定节点的N度关系节点

    import horizon.graphx.util.CollectionUtil.CollectionHelper import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable...

Global site tag (gtag.js) - Google Analytics