点击回首页
我的浏览记录 | | 帮助?
当前位置:

源码截图

源码目录树

// Original path: InsApp/InfoSearch/Getfile.cs
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Net.Mime;
using InsApp.log4;
using System.Web;
using System.Data.OleDb;
using System.Text.RegularExpressions;

namespace InsApp.InfoSearch
{ 

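    /// <summary>
    /// Fetches remote files over HTTP.
    /// Base class for the collectors that gather news and real-estate information.
    /// Format_Url(ref string[] URL_Array, string FormatUrl) normalizes the hyperlinks harvested from a page.
    /// </summary>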
    
    public class Getfile
    {


        // Backing stores for the public properties of this class.
        private string _Get_Url;                // URL of the page to request
        private string _Get_WebCode;            // encoding name used to decode the page body
        private string _Get_Method;             // HTTP verb; normalized to "get"/"post" when Web_Method is read
        private string _IsReadContent;          // "yes" = return the page body; otherwise only the status is reported

        // Type passed to LogUtil.ERROR as the logging source. A field initializer executes inside
        // this class's own constructor, so this is the same value that
        // MethodBase.GetCurrentMethod().DeclaringType yields there, without a reflection call
        // for every instance that is created.
        private readonly Type type = typeof(Getfile);

        #region Getfile   构造函数,初始化当前类2007-3-1
        /// <summary>
        /// Creates an instance without initializing any property.
        /// </summary>
        public Getfile() { }
        #endregion

        #region Get_Url   网页url
        /// <summary>
        /// URL of the web page to request; ReadPage passes it to WebRequest.Create.
        /// </summary>
        public string Get_Url
        {
            get
            {
                return _Get_Url;
            }
            set
            {
                _Get_Url = value;
            }
        }
        #endregion

        #region Get_WebCode 得到网页编码
        /// <summary>
        /// Name of the text encoding of the page; ReadPage hands it to Encoding.GetEncoding
        /// to decode the response body.
        /// </summary>
        public string Get_WebCode
        {
            get
            {
                return _Get_WebCode;
            }
            set
            {
                _Get_WebCode = value;
            }
        }
        #endregion

        #region IsReadContent  是否需要读取页面的内容yes,还是只判断页面是否存在no
        /// <summary>
        /// Whether the page body should be read. ReadPage returns the body only when this is "yes";
        /// for any other value it returns the status instead.
        /// Reads as "no" while the stored value is null or empty.
        /// </summary>
        public string IsReadContent
        {
            get { return string.IsNullOrEmpty(_IsReadContent) ? "no" : _IsReadContent; }
            set { _IsReadContent = value; }
        }
        #endregion

        #region 当前网页事件  post  get默认get
        /// <summary>
        /// HTTP method used for the request: "post" or "get".
        /// Reading the property normalizes the stored value: a value whose ToLower() form is
        /// "post" becomes "post"; everything else, including null and empty, becomes "get".
        /// </summary>
        public string Web_Method
        {
            get
            {
                bool wantsPost = !string.IsNullOrEmpty(_Get_Method)
                                 && _Get_Method.ToLower() == "post";

                // The normalized value is written back to the backing field before it is returned.
                _Get_Method = wantsPost ? "post" : "get";
                return _Get_Method;
            }
            set
            {
                _Get_Method = value;
            }
        }


                #endregion

        #region ReadWebPage调用ReadPage 判断网页是否存在
        /// <summary>
        /// Calls ReadPage and discards the string it returns.
        /// </summary>
        public void ReadWebPage()
        {
            ReadPage();
        }
        #endregion


        #region ReadPage()判断网页是否存在,并且根据页面编码读取该页面的内容
        /// <summary>
        /// Requests the page at Get_Url with the HTTP method from Web_Method and reports the outcome.
        /// </summary>
        /// <returns>
        /// "bad" when Get_Url is null or empty, or when no HTTP request can be created for it;
        /// the page body, decoded with the Get_WebCode encoding, when the status is OK and
        /// IsReadContent is "yes";
        /// the name of the status code (for example "OK") in every other successful case;
        /// the exception's ToString() text when sending the request or reading the body fails.
        /// </returns>
        public string ReadPage()
        {
            if (string.IsNullOrEmpty(Get_Url))
            {
                return "bad";
            }

            HttpWebRequest myWebRequest;
            try
            {
                // Throws for a malformed URL, and the cast throws for a non-HTTP scheme.
                myWebRequest = (HttpWebRequest)WebRequest.Create(Get_Url);
            }
            catch (Exception ex)
            {
                LogUtil.ERROR(type, ex.Message);
                return "bad";
            }

            myWebRequest.Method = Web_Method;
            myWebRequest.ContentType = "application/x-www-form-urlencoded";
            myWebRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*";
            myWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)";
            myWebRequest.KeepAlive = false;

            try
            {
                // using guarantees the response (and its connection) is released even when
                // GetEncoding or ReadToEnd throws.
                using (HttpWebResponse res = (HttpWebResponse)myWebRequest.GetResponse())
                {
                    if (res.StatusCode == HttpStatusCode.OK && this.IsReadContent == "yes")
                    {
                        using (StreamReader readerWeb = new StreamReader(res.GetResponseStream(), System.Text.Encoding.GetEncoding(Get_WebCode)))
                        {
                            return readerWeb.ReadToEnd();
                        }
                    }

                    // The status is read while the response is still open; reading it after
                    // the response has been closed can throw ObjectDisposedException.
                    return res.StatusCode.ToString();
                }
            }
            catch (Exception ex)
            {
                // A WebException raised for an error status carries its own response object,
                // which also holds a connection until it is closed.
                WebException webEx = ex as WebException;
                if (webEx != null && webEx.Response != null)
                {
                    webEx.Response.Close();
                }

                LogUtil.ERROR(type, ex.Message);
                return ex.ToString();
            }
        }
        #endregion


        #region 读取远程网页,判断并返回编码
        /// <summary>
        /// Downloads a page with GET, reads it as ASCII and extracts the charset declared in its markup.
        /// </summary>
        /// <param name="Get_Url">URL of the page to inspect (this parameter hides the Get_Url property inside the method).</param>
        /// <returns>
        /// "bad" when Getword.CheckNullstr rejects the URL or no HTTP request can be created for it;
        /// "null" when the response status is not OK;
        /// "utf-8" when the request, the read or the extraction throws;
        /// otherwise the text produced by Getword.CheckReg for the charset pattern, with double quotes removed.
        /// </returns>
        public string Get_PageEncode(string Get_Url)
        {
            Getword Tc = new Getword();

            // NOTE(review): CheckNullstr is defined outside this file; presumably it returns
            // false for a null or empty string - confirm against Getword.
            if (Tc.CheckNullstr(Get_Url) == false)
            {
                return "bad";
            }

            HttpWebRequest myWebRequest;
            try
            {
                // Throws for a malformed URL, and the cast throws for a non-HTTP scheme.
                myWebRequest = (HttpWebRequest)WebRequest.Create(Get_Url);
            }
            catch (Exception ex)
            {
                LogUtil.ERROR(type, ex.Message);
                return "bad";
            }

            myWebRequest.Method = "GET";
            myWebRequest.ContentType = "application/x-www-form-urlencoded";
            myWebRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*";
            myWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)";
            myWebRequest.KeepAlive = false;

            try
            {
                string getWebInfo;

                // using releases the response and the reader even when the read fails.
                using (HttpWebResponse res = (HttpWebResponse)myWebRequest.GetResponse())
                {
                    if (res.StatusCode != HttpStatusCode.OK)
                    {
                        return "null";
                    }

                    // ASCII is enough here: only the charset declaration is of interest.
                    using (StreamReader readerWeb = new StreamReader(res.GetResponseStream(), System.Text.Encoding.ASCII))
                    {
                        getWebInfo = readerWeb.ReadToEnd();
                    }
                }

                string Treg = "charset\\=(\\w[^<]*)\"";

                // NOTE(review): CheckReg is defined outside this file; presumably it returns the
                // text matched by the pattern - confirm against Getword.
                string _TheEncode = Tc.CheckReg(Treg, getWebInfo);

                // String.Replace returns a new string, so the result has to be the value returned;
                // calling it as a bare statement leaves the quotes in place.
                return _TheEncode.Replace("\"", "");
            }
            catch (Exception ex)
            {
                // A WebException raised for an error status carries its own response object,
                // which also holds a connection until it is closed.
                WebException webEx = ex as WebException;
                if (webEx != null && webEx.Response != null)
                {
                    webEx.Response.Close();
                }

                LogUtil.ERROR(type, ex.Message);
                return "utf-8";
            }
        }
        #endregion


        #region ReadFile(string 文件路径)  如果服务器端的文件存在的话,读取该文件内容
        /// <summary>
        /// Reads a text file and returns its lines joined together with the line terminators removed.
        /// </summary>
        /// <param name="FilePath">Path of the file to read.</param>
        /// <returns>All lines of the file concatenated, with no separator between them.</returns>
        /// <exception cref="FileNotFoundException">File.Exists reports that <paramref name="FilePath"/> does not exist.</exception>
        public string ReadFile(string FilePath)
        {
            if (!File.Exists(FilePath))
            {
                // FileNotFoundException derives from Exception, so existing catch (Exception)
                // handlers still apply; the message text is unchanged.
                throw new FileNotFoundException("文件不存在", FilePath);
            }

            StringBuilder content = new StringBuilder();

            // using closes the reader even when ReadLine throws part-way through the file.
            using (StreamReader sr = new StreamReader(FilePath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // ReadLine strips the terminator, so lines are joined back to back.
                    content.Append(line);
                }
            }

            return content.ToString();
        }
        #endregion

       
#region   接收sql语句and 数据库信息,返回bool结果
/// <summary>
/// Executes one non-query SQL statement against the OLE DB source named by a connection string.
/// </summary>
/// <param name="Db_Sql">SQL text, executed verbatim. NOTE(review): nothing here parameterizes or escapes it, so callers must not splice untrusted input into it.</param>
/// <param name="System_MdbInfo">OLE DB connection string.</param>
/// <returns>true when the statement executes without throwing; false when opening the connection or executing the statement fails (the message is logged).</returns>
protected bool GetSqlCmd_bool(string Db_Sql,string System_MdbInfo)
{
    // The connection is constructed outside the try block, so an exception raised by the
    // constructor still reaches the caller instead of being turned into false.
    // using closes the connection on every path, including failures.
    using (OleDbConnection Conn = new OleDbConnection(System_MdbInfo))
    {
        try
        {
            using (OleDbCommand myCmd = new OleDbCommand(Db_Sql, Conn))
            {
                Conn.Open();
                myCmd.ExecuteNonQuery();
            }
            return true;
        }
        catch (Exception ex)
        {
            LogUtil.ERROR(type, ex.Message);
            return false;
        }
    }
}
#endregion

#region  Filtrate(string Getpara) 过滤一些非法字符
/// <summary>
/// Cleans a text fragment: replaces apostrophes with a left single quotation mark,
/// removes markup tags and HTML entities, and trims surrounding white space.
/// </summary>
/// <param name="Getpara">Text to clean; may be null.</param>
/// <returns>The cleaned text, or an empty string when <paramref name="Getpara"/> is null or empty.</returns>
        protected string Filtrate(string Getpara)
        {
            if (string.IsNullOrEmpty(Getpara))
            {
                return "";
            }

            string cleaned = Getpara.Replace("'", "‘");

            // Remove every tag; the lazy quantifier stops at the first closing bracket.
            cleaned = Regex.Replace(cleaned, @"<.*?>", "", RegexOptions.IgnoreCase);

            // Remove one entity at a time (named or numeric). A greedy ".+" here would span from
            // the first ampersand to the last semicolon and delete the text between two entities.
            cleaned = Regex.Replace(cleaned, @"&#?\w+;", "", RegexOptions.IgnoreCase);

            return cleaned.Trim();
        }
#endregion

        #region  string[] Format_Url(string[] URL_Array,string FormatUrl)过滤一些非法字符格式化从页面中获得的超级链接
        /// <summary>
        /// Turns the hyperlinks harvested from a page into absolute URLs and filters them.
        /// Links longer than 8 characters are rewritten as follows:
        /// a link starting with "../" has all of its leading "../" segments replaced by the site root;
        /// a link starting with "/" is appended to the site root;
        /// any other link that does not start with "http://" or "https://" is appended to the
        /// directory of <paramref name="FormatUrl"/>.
        /// Afterwards only links that contain no "#", are longer than 39 characters and have not
        /// been seen before are kept, in their original order.
        /// </summary>
        /// <param name="URL_Array">Links to normalize. Elements are rewritten in place and the
        /// reference is then replaced by the filtered array. Null elements are skipped.</param>
        /// <param name="FormatUrl">URL of the page the links came from, used as the base.</param>
        /// <returns>The filtered array, which is also assigned to <paramref name="URL_Array"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="FormatUrl"/> is null.</exception>
        protected string[] Format_Url(ref string[] URL_Array,string FormatUrl)
        {
            if (FormatUrl == null)
            {
                throw new ArgumentNullException("FormatUrl");
            }
            if (URL_Array == null)
            {
                return URL_Array = new string[0];
            }

            // Links at or below this length are discarded by the final filter.
            const int MinUrlLength = 39;

            // Site root: "http://" plus everything up to and including the first "/" after the scheme.
            string withoutScheme = FormatUrl.Replace("http://", "");
            int firstSlash = withoutScheme.IndexOf('/');
            bool hasPath = firstSlash >= 0;

            // A base such as "http://host" has no "/" after the scheme; the whole remainder is
            // the host, so the root must keep it instead of collapsing to "http://".
            string siteRoot = hasPath
                ? "http://" + withoutScheme.Substring(0, firstSlash + 1)
                : "http://" + withoutScheme + "/";
            string siteRootNoSlash = siteRoot.Substring(0, siteRoot.Length - 1);

            // Directory of the base page: everything up to and including its last "/".
            // Without a path the last "/" belongs to the scheme, so the site root is used.
            string pageDirectory = hasPath
                ? FormatUrl.Substring(0, FormatUrl.LastIndexOf('/') + 1)
                : siteRoot;

            List<string> kept = new List<string>();
            Dictionary<string, bool> seen = new Dictionary<string, bool>();

            for (int i = 0; i < URL_Array.Length; i++)
            {
                string url = URL_Array[i];
                if (url == null)
                {
                    continue;
                }

                if (url.Length > 8)
                {
                    if (url.StartsWith("../", StringComparison.Ordinal))
                    {
                        // Strip every leading "../" and prefix the root once; replacing each
                        // occurrence with the root would embed the root several times.
                        do
                        {
                            url = url.Substring(3);
                        }
                        while (url.StartsWith("../", StringComparison.Ordinal));

                        url = siteRoot + url;
                    }
                    else if (url.StartsWith("/", StringComparison.Ordinal))
                    {
                        url = siteRootNoSlash + url;
                    }
                    else if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                             && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                    {
                        url = pageDirectory + url;
                    }

                    // Keep the in-place rewrite of the caller's array elements.
                    URL_Array[i] = url;
                }

                if (url.IndexOf('#') < 0 && url.Length > MinUrlLength && !seen.ContainsKey(url))
                {
                    seen.Add(url, true);
                    kept.Add(url);
                }
            }

            return URL_Array = kept.ToArray();
        }
        #endregion
    }


}
关于我们 | 顾问团队 | 发展历程 | 联系我们 | 源码上传
联系电话(Tel):4008-010-151(免长途) 企业QQ:4000410510
地址:北京市海淀区中关村鼎好大厦A座二层 邮编:100080
Room A-801,Dinghao Building,Zhongguancun,Beijing,China,100080
51Aspx.com 版权所有 CopyRight © 2006-2015. 京ICP备09089570号 | 京公网安备11010702000869号
在线客服
分享该页面
关闭侧边栏