点击回首页
我的浏览记录 | | 帮助?
当前位置:

源码截图

源码目录树

当前路径:InsApp/InfoSearch/CatchNews.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;

using InsApp.log4;

namespace InsApp.InfoSearch
{
    /// <summary>
    /// News collector for a single configured web site: downloads the list page,
    /// extracts article links with a regular expression, reads each article and
    /// inserts its title and body into the <c>content</c> table.
    /// </summary>
    /// <remarks>
    /// Members used here but not declared in this class (Get_WebCode, Get_Url,
    /// Web_Method, IsReadContent, Get_PageEncode, ReadPage, Format_Url, Filtrate,
    /// GetSqlCmd_bool) are presumably inherited from <c>Getfile</c> - confirm there.
    /// </remarks>
    public class CatchNews:Getfile
    {
        // Type handed to the logger; identical to what the former reflection lookup returned.
        private static readonly Type type = typeof(CatchNews);

        // Collection-rule settings supplied through the constructor.
        private string Txt_WebSiteName, Txt_WebSiteUrl, Txt_WebSiteNews_Url, Txt_WebSiteNews_Memo,
            Txt_News_title, Txt_News_Content, Txt_News_Author, Txt_News_Source, Txt_News_Time,
            Sel_menu1, Sel_menu2, Chk_SaveImage, Chk_Clearjs, Chk_Clearcss;

        // 2007-3-16: with an Access database the path could not be resolved once worker
        // threads were running, so it is passed in before the threads start.
        private string _MdbPath;

        /// <summary>
        /// Progress / result message. Static, so it is shared by every instance.
        /// </summary>
        public static string Showit;

        #region MdbPath                      database path
        /// <summary>
        /// Path of the database, passed to GetSqlCmd_bool together with each INSERT.
        /// </summary>
        public string MdbPath
        {
            get { return _MdbPath; }
            set { _MdbPath = value; }
        }
        #endregion

        #region CatchNews   constructor, 15 parameters, 2007-3-16
        /// <summary>
        /// Stores the settings of one news-collection rule (15 parameters in total).
        /// </summary>
        /// <param name="WebSiteName">Site name shown in progress messages.</param>
        /// <param name="WebSiteUrl">URL whose encoding is detected before the list page is read.</param>
        /// <param name="WebSiteNews_Url">Second argument handed to Format_Url when normalising the links.</param>
        /// <param name="WebSiteNews_Memo">Regular expression that matches article links in the list page.</param>
        /// <param name="News_title">Regular expression that extracts the article title.</param>
        /// <param name="News_Content">Regular expression that extracts the article body.</param>
        /// <param name="News_Author">Stored only; not used by this class.</param>
        /// <param name="News_Source">Regular expression applied to each page for the source (result not persisted).</param>
        /// <param name="News_Time">Regular expression applied to each page for the time (result not persisted).</param>
        /// <param name="menu1">Value written to C_BigTypeID.</param>
        /// <param name="menu2">Value written to C_SmallTypeID.</param>
        /// <param name="SaveImage">Stored only; not used by this class.</param>
        /// <param name="Clearjs">Stored only; not used by this class.</param>
        /// <param name="Clearcss">Stored only; not used by this class.</param>
        /// <param name="DatabasePath">Database path, exposed through <see cref="MdbPath"/>.</param>
        public CatchNews(string WebSiteName, string WebSiteUrl, string WebSiteNews_Url, string WebSiteNews_Memo,
        string News_title, string News_Content, string News_Author, string News_Source, string News_Time,
        string menu1, string menu2, string SaveImage, string Clearjs, string Clearcss,string DatabasePath)
        {
            Txt_WebSiteName = WebSiteName;                              //1
            Txt_WebSiteUrl = WebSiteUrl;                                //2
            Txt_WebSiteNews_Url = WebSiteNews_Url;                      //3
            Txt_WebSiteNews_Memo = WebSiteNews_Memo;                    //4
            Txt_News_title = News_title;                                //5
            Txt_News_Content = News_Content;                            //6
            Txt_News_Author = News_Author;                              //7
            Txt_News_Source = News_Source;                              //8
            Txt_News_Time = News_Time;                                  //9
            Sel_menu1 = menu1;                                          //10
            Sel_menu2 = menu2;                                          //11
            Chk_SaveImage = SaveImage;                                  //12
            Chk_Clearjs = Clearjs;                                      //13
            Chk_Clearcss = Clearcss;                                    //14
            MdbPath = DatabasePath;                                     //15
        }
        #endregion

        #region CatchWebInfo_Thread()  collection entry point meant to run on a worker thread (beta, 2007-3)
        /// <summary>
        /// Runs one collection pass:
        /// 1. read the HTML of the news list page;
        /// 2. match the article URLs with the configured regular expression;
        /// 3. read every URL;
        /// 4. extract title and body with the configured regular expressions;
        /// 5. skip pages whose title or body cannot be extracted, insert the rest.
        /// The outcome is reported through <see cref="Showit"/>.
        /// </summary>
        public void CatchWebInfo_Thread()
        {
            int CAll = 0;                                                   // number of rows inserted

            InsApp.InfoSearch.Getword MyWord = new Getword();

            Get_WebCode = Get_PageEncode(this.Txt_WebSiteUrl);              // encoding of the page
            string LastValue = this.ReadPage();                             // HTML of the news list

            // First argument is the pattern, second the text to search.
            string[] s_Array = MyWord.Check_url_Array(Txt_WebSiteNews_Memo, LastValue);
            if (s_Array == null)
            {
                Showit = "无法获取信息,采集停止" + DateTime.Now;
                return;
            }

            // Normalise the matched links.
            Format_Url(ref s_Array, Txt_WebSiteNews_Url);

            this.Web_Method = "get";
            this.IsReadContent = "yes";                                     // read the page body as well
            for (int i = 0; i < s_Array.Length; i++)
            {
                try
                {
                    this.Get_Url = MyWord.Check_ChineseCode(s_Array[i]);    // encode Chinese characters in the URL
                    string EveryPageNews_Content = this.ReadPage();         // HTML of this article page

                    string title = Filtrate(MyWord.CheckReg(Txt_News_title, EveryPageNews_Content).Replace("'", "‘"));
                    if (string.IsNullOrEmpty(title))
                    {
                        continue;                                           // no title: not an article page
                    }

                    string Content = MyWord.NoHTML(MyWord.Check_url_string(Txt_News_Content, EveryPageNews_Content));
                    if (string.IsNullOrEmpty(Content))
                    {
                        continue;                                           // no body: nothing worth storing
                    }

                    // NOTE(review): time and source are extracted but never written to the
                    // database (C_PubTime receives the current time). The calls are kept so
                    // that a failure inside them still skips the page exactly as before.
                    string time = Filtrate(MyWord.Check_url_string(Txt_News_Time, EveryPageNews_Content));
                    string source = Filtrate(MyWord.Check_url_string(Txt_News_Source, EveryPageNews_Content));

                    string InsertSql = BuildInsertSql(title, Content, this.Get_Url);

                    if (GetSqlCmd_bool(InsertSql, MdbPath) == true)
                    {
                        CAll++;
                        Showit += CAll + "、网站:" + Txt_WebSiteName + "    信息:<a href='" + this.Get_Url + "' target='_blank'>" + title + " </a>      成功<br/>";
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one broken page must not stop the whole run.
                    // ToString() keeps the exception type and stack trace in the log.
                    LogUtil.FATAL(type, ex.ToString());
                }
            }

            // NOTE(review): this replaces the per-item lines appended above with the final summary.
            Showit = "共计" + CAll + "条数据,采集结束。" + DateTime.Now;
        }

        /// <summary>
        /// Builds the INSERT statement for one article. Every text value is escaped
        /// because title, body and URL originate from downloaded pages.
        /// </summary>
        private string BuildInsertSql(string title, string content, string sourceUrl)
        {
            return " Insert into content(C_BigTypeID,C_SmallTypeID,C_Title,C_Content,C_PubTime,C_ViewCount,C_SourceUrl)" +
                " values ('" + EscapeSqlLiteral(Sel_menu1) + "','" + EscapeSqlLiteral(Sel_menu2) + "','" + EscapeSqlLiteral(title) + "','" + EscapeSqlLiteral(content) + "','" + System.DateTime.Now + "','1','" + EscapeSqlLiteral(sourceUrl) + "')";
        }

        /// <summary>
        /// Doubles single quotes so the value can sit inside a quoted SQL string literal.
        /// Null becomes an empty string, matching what string concatenation produced before.
        /// </summary>
        private static string EscapeSqlLiteral(string value)
        {
            if (value == null)
            {
                return "";
            }
            return value.Replace("'", "''");
        }
        #endregion

    }
}
关于我们 | 顾问团队 | 发展历程 | 联系我们 | 源码上传
联系电话(Tel):4008-010-151(免长途) 企业QQ:4000410510
地址:北京市海淀区中关村鼎好大厦A座二层 邮编:100080
Room A-801,Dinghao Building,Zhongguancun,Beijing,China,100080
51Aspx.com 版权所有 CopyRight © 2006-2015. 京ICP备09089570号 | 京公网安备11010702000869号
在线客服
分享该页面
关闭侧边栏