C#简易采集工具

这段时间比较忙,一直在搞CentOS,所以很少碰编程。但是昨天修改以前写的这个采集工具的时候,不小心把GUI版本改坏了,我也不打算修复了,今天索性把它的核心代码放出来,我相信有很多人需要它。

Program.cs 文件[入口文件]

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.IO;
using System.Text.RegularExpressions;
using System.Threading;

namespace Gather
{
    /// <summary>
    /// Console front end of the gather tool: shows the menu, loads the INI
    /// configuration and dispatches to the list-page, article-page and
    /// rule-test jobs implemented by <see cref="Gather"/>.
    /// </summary>
    class Program
    {
        // Number of ruleN / dataN slots read from the [page], [log] and [test] INI sections.
        private const int PageRuleCount = 5;
        private const int LogRuleCount = 15;
        private const int TestRuleCount = 5;

        /// <summary>
        /// Runs the interactive menu loop until the user chooses option 0.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.Title = "简易采集工具 - A Gather Tool";
            Core core = new Core();

            bool menu = true;

            // Stays null until a configuration file has been loaded successfully;
            // the menu and the job guards use it as the "configured" flag.
            string inifile = null;

            Ini ini = new Ini();

            string sign = null;
            string page = null;
            string log = null;
            string test = null;

            string suffix = null;
            string encoded = null;

            string page_url = null;
            string test_url = null;

            string start = null;
            string end = null;

            string[] page_rule = new string[PageRuleCount];
            string[] page_data = new string[PageRuleCount];

            string[] log_rule = new string[LogRuleCount];
            string[] log_data = new string[LogRuleCount];

            string[] test_rule = new string[TestRuleCount];
            string[] test_data = new string[TestRuleCount];

            /************************************************************************/
            /* Main program loop                                                    */
            /************************************************************************/

            while (menu)
            {
                core.copy();

                if (inifile == null)
                {
                    Console.Write(" 请按编号输入要执行的操作:\n\n [1] 设置配置文件\n [*] 采集列表页\n [*] 采集文章页\n [*] 采集规则测试\n [0] 退出程序\n\n 编号:");
                }
                else
                {
                    Console.Write(" 请按编号输入要执行的操作:\n\n [1] 设置配置文件\n [2] 采集列表页\n [3] 采集文章页\n [4] 采集规则测试\n [0] 退出程序\n\n 编号:");
                }

                try
                {
                    // Non-numeric input is mapped to -1 so it falls into the
                    // "invalid number" default branch instead of throwing.
                    int doit;
                    if (!int.TryParse(Console.ReadLine(), out doit))
                    {
                        doit = -1;
                    }

                    switch (doit)
                    {
                        /************************************************************/
                        /* Exit                                                     */
                        /************************************************************/
                        case 0:
                            menu = false;
                            break;

                        /************************************************************/
                        /* Choose the configuration file                            */
                        /************************************************************/
                        case 1:

                            bool setini = true;

                            while (setini)
                            {
                                core.copy();

                                Console.Write(" 请输入配置文件的路径,例如:d:\\catch\\fdawn.ini\n 若要使用同目录下的配置文件请直接填写名称,例如:fdawn.ini\n\n 路径:");

                                // Resolve first, then test existence, so the file that is
                                // checked is exactly the file that will be read.
                                string candidate = ResolveIniPath(Console.ReadLine());

                                if (candidate != null && File.Exists(candidate))
                                {
                                    core.copy();

                                    inifile = candidate;

                                    Console.WriteLine(" 当前设置的配置文件的路径为 {0}", inifile);
                                    Console.ReadKey();
                                    setini = false;

                                    /************************************************/
                                    /* Load all settings                            */
                                    /************************************************/

                                    ini.setfile(inifile);

                                    sign = ini.readini("setting", "sign");
                                    page = ini.readini("setting", "page");
                                    log = ini.readini("setting", "log");
                                    test = ini.readini("setting", "test");

                                    suffix = ini.readini("setting", "suffix");
                                    encoded = ini.readini("setting", "encoded");

                                    page_url = ini.readini("page", "url");
                                    test_url = ini.readini("test", "url");

                                    start = ini.readini("page", "start");
                                    end = ini.readini("page", "end");

                                    // Fresh arrays so values from a previously loaded file cannot linger.
                                    page_rule = new string[PageRuleCount];
                                    page_data = new string[PageRuleCount];

                                    log_rule = new string[LogRuleCount];
                                    log_data = new string[LogRuleCount];

                                    test_rule = new string[TestRuleCount];
                                    test_data = new string[TestRuleCount];

                                    for (int i = 0; i < PageRuleCount; i++)
                                    {
                                        page_rule[i] = ini.readini("page", "rule" + i);
                                        page_data[i] = ini.readini("page", "data" + i);
                                    }

                                    for (int i = 0; i < LogRuleCount; i++)
                                    {
                                        log_rule[i] = ini.readini("log", "rule" + i);
                                        log_data[i] = ini.readini("log", "data" + i);
                                    }

                                    for (int i = 0; i < TestRuleCount; i++)
                                    {
                                        test_rule[i] = ini.readini("test", "rule" + i);
                                        test_data[i] = ini.readini("test", "data" + i);
                                    }
                                }
                                else
                                {
                                    core.copy();

                                    Console.WriteLine(" 配置文件不存在!");
                                    Console.ReadKey();
                                }
                            }

                            break;

                        /************************************************************/
                        /* Gather list pages                                        */
                        /************************************************************/
                        case 2:
                            if (!RequireConfig(core, inifile))
                            {
                                break;
                            }

                            core.copy();

                            try
                            {
                                Gather box = new Gather(page_url, sign);

                                box.page(start, end, page_rule, page_data, page, suffix, encoded);
                            }
                            catch (Exception)
                            {
                                // Any failure of the job is reported with the same generic message.
                                core.copy();
                                Console.WriteLine(" 采集规则出现错误或网络不稳定!");
                            }

                            Console.ReadKey();
                            break;

                        /************************************************************/
                        /* Gather article pages                                     */
                        /************************************************************/
                        case 3:
                            if (!RequireConfig(core, inifile))
                            {
                                break;
                            }

                            core.copy();

                            try
                            {
                                Gather box = new Gather(page_url, sign);

                                // The article URLs come from the merged "-last" file written by the list-page job.
                                box.log(log_rule, log_data, page + "-last" + suffix, log, suffix, encoded);
                            }
                            catch (Exception)
                            {
                                core.copy();
                                Console.WriteLine(" 采集规则出现错误或网络不稳定!");
                            }

                            Console.ReadKey();
                            break;

                        /************************************************************/
                        /* Test the gather rules                                    */
                        /************************************************************/
                        case 4:
                            if (!RequireConfig(core, inifile))
                            {
                                break;
                            }

                            core.copy();

                            try
                            {
                                Gather box = new Gather(test_url);

                                box.test(test_rule, test_data, test, suffix, encoded);
                            }
                            catch (Exception)
                            {
                                core.copy();
                                Console.WriteLine(" 采集规则出现错误或网络不稳定!");
                            }

                            Console.ReadKey();
                            break;

                        /************************************************************/
                        /* Unknown menu number                                      */
                        /************************************************************/
                        default:
                            core.copy();

                            Console.WriteLine(" 无效的编号!");
                            Console.ReadKey();
                            break;
                    }
                }
                catch
                {
                    // Kept from the original: anything unexpected inside the menu
                    // handling is reported as an invalid choice and the menu is shown again.
                    core.copy();

                    Console.WriteLine(" 无效的编号!");
                    Console.ReadKey();
                }
            }

            core.copy();

            Console.WriteLine(" 再见!");
            Console.ReadKey();
        }

        /// <summary>
        /// Turns the path typed by the user into a fully-qualified path.
        /// Relative input is resolved against the application's base directory;
        /// rooted input is kept and only normalised.
        /// </summary>
        /// <param name="input">Raw text read from the console; may be null.</param>
        /// <returns>The fully-qualified path, or null when the input is empty or not a legal path.</returns>
        private static string ResolveIniPath(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return null;
            }

            try
            {
                // Path.Combine returns the second argument unchanged when it is already rooted.
                // A full path matters because the Win32 profile API looks in the Windows
                // directory when it is given a name without one.
                return Path.GetFullPath(Path.Combine(Thread.GetDomain().BaseDirectory, input.Trim()));
            }
            catch (ArgumentException)
            {
                return null;
            }
            catch (NotSupportedException)
            {
                return null;
            }
            catch (PathTooLongException)
            {
                return null;
            }
        }

        /// <summary>
        /// Checks that a configuration file has been loaded; when it has not,
        /// shows the "set the configuration first" notice and waits for a key.
        /// </summary>
        /// <param name="core">Used to redraw the banner before the notice.</param>
        /// <param name="inifile">Path of the loaded configuration file, or null.</param>
        /// <returns>true when a configuration file is loaded; otherwise false.</returns>
        private static bool RequireConfig(Core core, string inifile)
        {
            if (inifile != null)
            {
                return true;
            }

            core.copy();

            Console.WriteLine(" 请先设置配置文件!");
            Console.ReadKey();
            return false;
        }
    }
}

Ini.cs 文件[Ini类]

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.Runtime.InteropServices;
using System.IO;

namespace Gather
{
    /// <summary>
    /// Minimal reader/writer for INI files built on the Win32
    /// private-profile functions exported by kernel32.
    /// </summary>
    class Ini
    {
        // Size of the first read buffer, in characters (the original fixed size).
        private const int InitialBufferSize = 500;

        // Upper bound for the growing read buffer, in characters.
        private const int MaxBufferSize = 32767;

        /// <summary>Path of the INI file that <see cref="readini"/> and <see cref="writeini"/> operate on.</summary>
        public string filepath;

        // The native function returns a 32-bit BOOL, so it is marshalled as bool.
        [DllImport("kernel32")]
        [return: MarshalAs(UnmanagedType.Bool)]
        private static extern bool WritePrivateProfileString(string section, string key, string val, string filePath);

        [DllImport("kernel32")]
        private static extern int GetPrivateProfileString(string section, string key, string def, StringBuilder retval, int size, string filePath);

        /// <summary>Selects the INI file used by later reads and writes.</summary>
        /// <param name="filepath">Path of the INI file.</param>
        public void setfile(string filepath)
        {
            this.filepath = filepath;
        }

        /// <summary>Writes one key/value pair into a section of the selected file.</summary>
        /// <param name="section">Section name, without brackets.</param>
        /// <param name="key">Key name inside the section.</param>
        /// <param name="value">Value to store.</param>
        public void writeini(string section, string key, string value)
        {
            // The success flag is deliberately ignored, as in the original best-effort contract.
            WritePrivateProfileString(section, key, value, filepath);
        }

        /// <summary>Reads one value from the selected file.</summary>
        /// <param name="section">Section name, without brackets.</param>
        /// <param name="key">Key name inside the section.</param>
        /// <returns>The stored value, or an empty string when the key is not found.</returns>
        public string readini(string section, string key)
        {
            int capacity = InitialBufferSize;

            while (true)
            {
                StringBuilder buffer = new StringBuilder(capacity);
                int copied = GetPrivateProfileString(section, key, "", buffer, capacity, filepath);

                // A return value of size - 1 means the value was cut to fit the buffer,
                // so retry with a larger one instead of handing back a truncated rule.
                if (copied < capacity - 1 || capacity >= MaxBufferSize)
                {
                    return buffer.ToString();
                }

                capacity = Math.Min(capacity * 2, MaxBufferSize);
            }
        }
    }
}

Gather.cs 文件[采集类]

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Collections;

namespace Gather
{
    /// <summary>
    /// Downloads pages, applies a chain of regular-expression rules to them and
    /// stores every intermediate result in a cache file named
    /// prefix-[page]-[rule index]suffix.
    /// Rule 0 is applied to the downloaded page; every later rule i is applied to
    /// the cache file written by the rule whose index is given in data[i].
    /// </summary>
    class Gather:Core
    {
        // How many times a download is attempted before the job gives up.
        private const int MaxFetchAttempts = 5;

        // Downloaded/matched content must be longer than this many characters to count as a hit.
        private const int MinContentLength = 5;

        // Number of rule slots handled by page()/test() and by log().
        private const int ListRuleCount = 5;
        private const int ArticleRuleCount = 15;

        private string page_url, test_url, sign;

        /// <summary>Creates a gatherer for list and article pages.</summary>
        /// <param name="page_url">List-page URL containing the placeholder <paramref name="sign"/>.</param>
        /// <param name="sign">Placeholder text that is replaced by the page number.</param>
        public Gather(string page_url, string sign)
        {
            this.page_url = page_url;
            this.sign = sign;
        }

        /// <summary>Creates a gatherer for the rule test.</summary>
        /// <param name="test_url">URL of the page the test rules are applied to.</param>
        public Gather(string test_url)
        {
            this.test_url = test_url;
        }

        /// <summary>Creates the file if needed and empties it.</summary>
        private void clearfile(string filename)
        {
            using (StreamWriter sw = new StreamWriter(filename))
            {
                sw.Write("");
            }
        }

        /// <summary>Appends text to a file (UTF-8, the encoding File.AppendText uses).</summary>
        private void writefile(string str, string filename)
        {
            using (StreamWriter sw = File.AppendText(filename))
            {
                sw.Write(str);
            }
        }

        /// <summary>
        /// Reads all lines of a file produced by this class.
        /// The files are always written as UTF-8, so they are read back as UTF-8
        /// regardless of the encoding of the downloaded pages.
        /// </summary>
        private string[] readfile(string filename)
        {
            List<string> lines = new List<string>();

            using (StreamReader sr = new StreamReader(filename, Encoding.UTF8))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    lines.Add(line);
                }
            }

            return lines.ToArray();
        }

        /// <summary>
        /// Downloads a list page and returns the matches of a rule in it.
        /// </summary>
        /// <param name="regular">Regular expression to apply.</param>
        /// <param name="url">URL template containing <paramref name="sign"/>.</param>
        /// <param name="encode">Name of the page's text encoding.</param>
        /// <param name="page">Text that replaces <paramref name="sign"/> in the URL.</param>
        /// <param name="sign">Placeholder inside <paramref name="url"/>.</param>
        /// <returns>All matches; empty when the download failed.</returns>
        public MatchCollection initialise(string regular, string url, string encode, string page, string sign)
        {
            string realurl = url.Replace(sign, page);
            string box = link(realurl, encode);
            return replace(regular, box);
        }

        /// <summary>
        /// Downloads a page and returns the matches of a rule in it.
        /// </summary>
        /// <param name="regular">Regular expression to apply.</param>
        /// <param name="url">URL of the page.</param>
        /// <param name="encode">Name of the page's text encoding.</param>
        /// <returns>All matches; empty when the download failed.</returns>
        public MatchCollection initialise(string regular, string url, string encode)
        {
            string box = link(url, encode);
            return replace(regular, box);
        }

        /// <summary>Returns all case-insensitive matches of a pattern in a text.</summary>
        private MatchCollection replace(string regular, string main)
        {
            Regex catch_main = new Regex(regular, RegexOptions.IgnoreCase);
            return catch_main.Matches(main);
        }

        /// <summary>
        /// Downloads a URL as text. This is best-effort by contract: any failure
        /// yields an empty string, which callers detect through the content length.
        /// </summary>
        private string link(string url, string encode)
        {
            try
            {
                WebRequest request = WebRequest.Create(url);

                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(encode)))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception)
            {
                // Deliberate catch-all, see summary; callers bound the number of retries.
                return string.Empty;
            }
        }

        /// <summary>Tells whether an INI value holds a real setting (not empty and not the "NULL" marker).</summary>
        private static bool isactive(string value)
        {
            return !string.IsNullOrEmpty(value) && value != "NULL";
        }

        /// <summary>
        /// Downloads a page and applies rule 0, downloading again when the combined
        /// matches are not longer than <see cref="MinContentLength"/>.
        /// </summary>
        /// <exception cref="InvalidOperationException">No usable content after <see cref="MaxFetchAttempts"/> attempts.</exception>
        private MatchCollection fetchmatches(string regular, string url, string encode)
        {
            for (int attempt = 0; attempt < MaxFetchAttempts; attempt++)
            {
                MatchCollection found = this.initialise(regular, url, encode);

                int total = 0;
                foreach (Match hit in found)
                {
                    total += hit.Value.Length;
                }

                if (total > MinContentLength)
                {
                    return found;
                }
            }

            throw new InvalidOperationException("No content matched rule 0 at " + url);
        }

        /// <summary>Replaces the content of a cache file with the concatenated match values.</summary>
        private void writematches(MatchCollection found, string filename, string separator)
        {
            StringBuilder output = new StringBuilder();
            foreach (Match hit in found)
            {
                output.Append(hit.Value).Append(separator);
            }

            this.clearfile(filename);
            this.writefile(output.ToString(), filename);
        }

        /// <summary>
        /// Applies rules 1..count-1. Each active rule i reads the cache file
        /// stem + data[i] + suffix and writes its matches, one per line, to
        /// stem + i + suffix.
        /// </summary>
        /// <returns>The highest index of an applied rule, or 0 when none was active.</returns>
        private int applyrules(string[] rules, string[] data, int count, string stem, string suffix)
        {
            int highest = 0;
            int limit = Math.Min(count, Math.Min(rules.Length, data.Length));

            for (int i = 1; i < limit; i++)
            {
                if (!isactive(rules[i]) || !isactive(data[i]))
                {
                    continue;
                }

                string target = stem + i + suffix;
                this.clearfile(target);

                // Lines are joined without separators so a rule can match across line breaks.
                string source = string.Concat(this.readfile(stem + data[i] + suffix));

                this.writematches(this.replace(rules[i], source), target, "\n");
                highest = i;
            }

            return highest;
        }

        /// <summary>
        /// Gathers the list pages numbered start..end and merges the output of the
        /// last active rule of every page into prefix-last+suffix.
        /// </summary>
        /// <param name="start">First page number (integer text).</param>
        /// <param name="end">Last page number, inclusive (integer text).</param>
        /// <param name="page_rule">Rules; index 0 is applied to the downloaded page.</param>
        /// <param name="page_data">For each rule, the index of the rule whose output it reads.</param>
        /// <param name="prefix">File name prefix of the cache files.</param>
        /// <param name="suffix">File name suffix of the cache files.</param>
        /// <param name="encode">Name of the pages' text encoding.</param>
        /// <exception cref="InvalidOperationException">A page produced no usable content.</exception>
        public void page(string start, string end, string[] page_rule, string[] page_data, string prefix, string suffix, string encode)
        {
            Console.WriteLine(" 正在执行采集任务!请等待...");

            int first = int.Parse(start);
            int last = int.Parse(end);
            int lastRule = 0;

            for (int num = first; num <= last; num++)
            {
                string stem = prefix + "-" + num + "-";
                string realurl = page_url.Replace(sign, num.ToString());

                /* Download and write the rule 0 cache */
                this.writematches(this.fetchmatches(page_rule[0], realurl, encode), stem + "0" + suffix, "");

                /* Apply the remaining rules to the caches */
                lastRule = this.applyrules(page_rule, page_data, ListRuleCount, stem, suffix);
            }

            /* Merge the final rule output of every gathered page */
            StringBuilder merged = new StringBuilder();
            for (int num = first; num <= last; num++)
            {
                foreach (string line in this.readfile(prefix + "-" + num + "-" + lastRule + suffix))
                {
                    merged.Append(line).Append("\n");
                }
            }

            this.clearfile(prefix + "-last" + suffix);
            this.writefile(merged.ToString(), prefix + "-last" + suffix);

            this.copy();
            Console.WriteLine(" 采集任务已执行完成!");
            Console.ReadKey();
        }

        /// <summary>
        /// Gathers every article whose URL is listed, one per line, in a file.
        /// Blank lines are skipped; the n-th remaining URL uses cache files prefix-n-*.
        /// </summary>
        /// <param name="log_rule">Rules; rule 0 is not applied here, the whole page is cached as index 0.</param>
        /// <param name="log_data">For each rule, the index of the rule whose output it reads.</param>
        /// <param name="filename">File containing the article URLs.</param>
        /// <param name="prefix">File name prefix of the cache files.</param>
        /// <param name="suffix">File name suffix of the cache files.</param>
        /// <param name="encode">Name of the pages' text encoding.</param>
        /// <exception cref="InvalidOperationException">An article could not be downloaded.</exception>
        public void log(string[] log_rule, string[] log_data, string filename, string prefix, string suffix, string encode)
        {
            Console.WriteLine(" 正在执行采集任务!请等待...");

            List<string> log_url = new List<string>();
            foreach (string line in this.readfile(filename))
            {
                string url = line.Trim();
                if (url.Length > 0)
                {
                    log_url.Add(url);
                }
            }

            /* Download and write the raw page caches */
            for (int i = 0; i < log_url.Count; i++)
            {
                string cache = prefix + "-" + i + "-0" + suffix;
                this.clearfile(cache);

                string content = string.Empty;
                for (int attempt = 0; attempt < MaxFetchAttempts && content.Length <= MinContentLength; attempt++)
                {
                    content = this.link(log_url[i], encode);
                }

                if (content.Length <= MinContentLength)
                {
                    throw new InvalidOperationException("Could not download " + log_url[i]);
                }

                this.writefile(content, cache);
            }

            /* Apply the rules to the caches */
            for (int num = 0; num < log_url.Count; num++)
            {
                this.applyrules(log_rule, log_data, ArticleRuleCount, prefix + "-" + num + "-", suffix);
            }

            this.copy();
            Console.WriteLine(" 采集任务已执行完成!");
            Console.ReadKey();
        }

        /// <summary>
        /// Runs the test rules against the test URL; cache files are named prefix-[rule index]suffix.
        /// </summary>
        /// <param name="test_rule">Rules; index 0 is applied to the downloaded page.</param>
        /// <param name="test_data">For each rule, the index of the rule whose output it reads.</param>
        /// <param name="prefix">File name prefix of the cache files.</param>
        /// <param name="suffix">File name suffix of the cache files.</param>
        /// <param name="encode">Name of the page's text encoding.</param>
        /// <exception cref="InvalidOperationException">The page produced no usable content.</exception>
        public void test(string[] test_rule, string[] test_data, string prefix, string suffix, string encode)
        {
            this.copy();
            Console.WriteLine(" 正在执行采集任务!请等待...");

            /* Download and write the rule 0 cache */
            this.writematches(this.fetchmatches(test_rule[0], test_url, encode), prefix + "-0" + suffix, "");

            /* Apply the remaining rules to the caches */
            this.applyrules(test_rule, test_data, ListRuleCount, prefix + "-", suffix);

            this.copy();
            Console.WriteLine(" 采集任务已执行完成!");
            Console.ReadKey();
        }
    }
}

Core.cs 文件[核心类]

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;

namespace Gather
{
    /// <summary>
    /// Base class holding functionality shared by the tool; currently only the
    /// console banner.
    /// </summary>
    class Core
    {
        /// <summary>
        /// Clears the console and prints the five-line banner
        /// (separator, module, author, warning, separator).
        /// </summary>
        public void copy()
        {
            string[] banner =
            {
                " --------------------------------------------------------------------------",
                " Module: Gather Tool",
                " Author: _Gemini.Ki",
                " Warning:    Ini file need ANSI encoded",
                " --------------------------------------------------------------------------"
            };

            Console.Clear();

            foreach (string row in banner)
            {
                Console.WriteLine(row);
            }
        }
    }
}

[如何扩展核心功能]

如要扩展功能,可以在Core.cs文件里面添加。

fdawn.ini 文件[配置文件]

;--------------------------------------------------------------------------
; Module:   Gather Tool
; Author:   _Gemini.Ki
; Warning:  This Ini file need ANSI encoded
;--------------------------------------------------------------------------

;全局设置 不使用的属性值请设置为“NULL”
[setting]
 ;通配符
 sign   = [{*}]
 ;列表页数据保存的文件名前缀
 page   = D:\Gather\page\page
 ;文章页数据保存的文件名前缀
 log        = D:\Gather\log\log
 ;测试页数据保存的文件名
 test   = D:\Gather\test
 
 ;数据保存的文件后缀
 suffix = .txt
 ;编码
 encoded    = utf-8

;列表页设置 从0开始,最大规则数为5,超出无效!
[page]
 ;列表页URL
 url        = http://www.nowamagic.net/librarys/veda/channel/ProgrammingLanguage/[{*}]/
 ;列表页采集开始页数
 start  = 1
 ;列表页采集结束页数
 end        = 3

 ;规则 rule0为匹配大标签,大标签必须包含其他规则的内容,否则结果为空!
 rule0  = <div id="left_content">([\d\D]*?)(?=<div id="sidebar">)
 rule1  = (?<=<div class="post_content">)([\d\D]*?)(?=</div>)
 rule2  = (?<=href=")([\d\D]*?)(?=")
 rule3  = NULL
 rule4  = NULL

 ;  数据源 dataN 的值是规则编号,表示 ruleN 在该规则采集到的数据上继续匹配;例如 data2 = 1 就是 rule2 使用 rule1 采集到的数据继续匹配
 data0  = 0
 data1  = 0
 data2  = 1
 data3  = NULL
 data4  = NULL

;文章页设置 从0开始,最大规则数为15,超出无效!
[log]
 
 ;规则 rule0为匹配大标签,大标签必须包含其他规则的内容,否则结果为空!
 rule0  = (?<=<div class="fullbox_content">)([\d\D]*?)(?=<div class="fullbox_footer"></div>)
 rule1  = (?<=rel="bookmark">)([\d\D]*?)(?=</a>)
 rule2  = (?<=<div class="post_info_left">)([\d\D]*?)(?=</div>)
 rule3  = (\d{2,4}(.*?)){3}(?= )
 rule4  = (?<=<div class="post_content readmood" id="defend_\d+">)([\d\D]*?)(?=<div class="fav_area">)
 rule5  = NULL
 rule6  = NULL
 rule7  = NULL
 rule8  = NULL
 rule9  = NULL
 rule10 = NULL
 rule11 = NULL
 rule12 = NULL
 rule13 = NULL
 rule14 = NULL

 ;数据源
 data0  = 0
 data1  = 0
 data2  = 0
 data3  = 2
 data4  = 0
 data5  = NULL
 data6  = NULL
 data7  = NULL
 data8  = NULL
 data9  = NULL
 data10 = NULL
 data11 = NULL
 data12 = NULL
 data13 = NULL
 data14 = NULL

;采集规则测试设置
[test]
 ;用于测试规则的页面URL
 url        = http://www.nowamagic.net/librarys/veda/channel/ProgrammingLanguage/1/

 ;规则 rule0为匹配大标签,大标签必须包含其他规则的内容,否则结果为空!
 rule0  = <div id="left_content">([\d\D]*?)(?=<div id="sidebar">)
 rule1  = (?<=<div class="post_content">)([\d\D]*?)(?=</div>)
 rule2  = (?<=href=")([\d\D]*?)(?=")
 rule3  = NULL
 rule4  = NULL

 ;  数据源 dataN 的值是规则编号,表示 ruleN 在该规则采集到的数据上继续匹配;例如 data2 = 1 就是 rule2 使用 rule1 采集到的数据继续匹配
 data0  = 0
 data1  = 0
 data2  = 1
 data3  = NULL
 data4  = NULL

[一直停留在正在采集]

若采集时一直卡在“正在采集”,很可能是采集规则错误引起了无限循环匹配。卡住时请强制关闭程序,修改好规则后再执行!

[乱码解决方法]

把配置文件保存成ANSI编码,而且要在配置文件设置采集页面的编码!

标签:工具, c#, 简易, 采集

该文章由 Shiqi Qiu 原创并发布在 被遗忘的曙光 技术博客

转载请标明来源:http://blog.fdawn.com/C-Sharp/15.html

添加新评论