/*
.net/c#: 利用反射编写通用的 rss 2.0 的 reader
最近在写一个 simple rss reader
网上找到现成代码两种:
1.代码简单的,但不够通用 (如: 本站的一些专用 rss reader)
2.代码复杂的,但没有足够时间去消化 (如: rssbandit)
遂自己动手:
由于 rss 的基本属性大家都有!
但一些特殊不通用属性,如:
slash:comments
wfw:comment
wfw:commentrss
trackbackping
不一定存在! 如何处理???
我想到了 reflection,就此提出以下解决方案:
1. class rssheader 用于表示 rss 的头信息
你可以在为其添加新属性,原则是:
成员变量 field 的名称为 rss 的 xml 源对应的属性名称前加下划线,xml 属性名称含有 ":" 将其滤掉!
如: <dc:language>zh-chs</dc:language>
将其影射为:
private string _dclanguage
public string dclanguage
{
get
{
return this._dclanguage;
}
}
2. class rssitem 用于表示 rss 的 item
添加新属性的原则同 rssheader!
3. 获取 rss 的 xml 源后通过递归遍历节点 (class simplerssreader)
根据实际存在的 rss 属性,通过反射,"构造实例化" rssheader 和 rssitem!
请仔细参阅 class simplerssreader 的 travel 方法!
4. 数据库 (本文使用了 Microsoft Data Access Application Block 3.1)
表:
channels (主表)
channelsdetails (细表)
字段名称及其数据类型严格按照 rss 的 xml 源对应的属性名称,xml 属性名称含有 ":" 将其滤掉!
存储过程:
sp_addchannel
sp_addchannelsdetails
参数名称及其数据类型严格按照 rss 的 xml 源对应的属性名称,xml 属性名称含有 ":" 将其滤掉!
命令行编译:
csc simplerssreader.cs /r:c:\windows\microsoft.net\framework\v1.1.4322\system.data.oracleclient.dll
全部代码 simplerssreader.cs 在此下载
http://www.cnblogs.com/files/microshaoft/simplerssreader.rar
*/
/*
全部代码 simplerssreader.cs 在此下载
http://www.cnblogs.com/files/microshaoft/simplerssreader.rar
*/
namespace microshaoft
{
using system;
using system.xml;
using system.text;
using system.reflection;
using system.collections;
using system.text.regularexpressions;
/// <summary>
/// Channel-level (header) information of an RSS 2.0 feed.
/// Apart from the feed url, each value lives in a private string field whose
/// name is "_" followed by the XML element name with any ":" removed
/// (for example dc:language maps to _dclanguage). Class simplerssreader looks
/// these fields up by name through reflection, so they must keep their names.
/// </summary>
public class rssheader
{
    // Set by the constructor.
    private string _url;

    // The private fields below are assigned through reflection in class simplerssreader.
    private string _title;
    private string _description;
    private string _link;
    private string _language;
    private string _generator;
    private string _ttl;
    private string _copyright;
    private string _pubdate;
    private string _category;
    private string _lastbuilddate;
    private string _managingeditor;
    private string _dclanguage; // dc:language

    /// <summary>
    /// Creates a header for the feed at the given address.
    /// </summary>
    /// <param name="url">feed url</param>
    public rssheader(string url)
    {
        _url = url;
    }

    public string title { get { return _title; } }

    public string description { get { return _description; } }

    public string link { get { return _link; } }

    public string language { get { return _language; } }

    public string generator { get { return _generator; } }

    public string ttl { get { return _ttl; } }

    public string copyright { get { return _copyright; } }

    /// <summary>
    /// Publication date, converted from the raw text by util.parsedatetime.
    /// </summary>
    public datetime pubdate { get { return util.parsedatetime(_pubdate); } }

    public string category { get { return _category; } }

    /// <summary>
    /// Last build date, converted from the raw text by util.parsedatetime.
    /// </summary>
    public datetime lastbuilddate { get { return util.parsedatetime(_lastbuilddate); } }

    public string managingeditor { get { return _managingeditor; } }

    /// <summary>
    /// The feed url passed to the constructor.
    /// </summary>
    public string url { get { return _url; } }

    public string dclanguage { get { return _dclanguage; } }
}
/// <summary>
/// One item of an RSS 2.0 feed.
/// Each value lives in a private field whose name is "_" followed by the XML
/// element name with any ":" removed (for example wfw:commentrss maps to
/// _wfwcommentrss). Class simplerssreader assigns these fields, including
/// _header, by name through reflection, so they must keep their names.
/// </summary>
public class rssitem
{
    // Header of the feed this item belongs to; assigned through reflection.
    private rssheader _header;

    // The private fields below are assigned through reflection in class simplerssreader.
    private string _title;
    private string _link;
    private string _description;
    private string _category;
    private string _author;
    private string _pubdate;
    private string _comments;
    private string _guid;
    private string _slashcomments;  // slash:comments
    private string _wfwcomment;     // wfw:comment
    private string _wfwcommentrss;  // wfw:commentrss
    private string _trackbackping;

    /// <summary>
    /// Header of the feed this item was read from.
    /// </summary>
    public rssheader header { get { return _header; } }

    public string trackbackping { get { return _trackbackping; } }

    public string wfwcommentrss { get { return _wfwcommentrss; } }

    public string wfwcomment { get { return _wfwcomment; } }

    public string slashcomments { get { return _slashcomments; } }

    public string title { get { return _title; } }

    public string link { get { return _link; } }

    public string description { get { return _description; } }

    public string category { get { return _category; } }

    public string author { get { return _author; } }

    /// <summary>
    /// Publication date, converted from the raw text by util.parsedatetime.
    /// </summary>
    public datetime pubdate { get { return util.parsedatetime(_pubdate); } }

    public string comments { get { return _comments; } }

    public string guid { get { return _guid; } }
}
/// <summary>
/// Minimal RSS 2.0 reader. It loads the feed, then uses reflection to copy
/// every channel / item child element into the private field of rssheader /
/// rssitem named "_" + element name with ":" removed. Elements that have no
/// matching field are ignored, so optional elements need no special handling.
/// </summary>
public class simplerssreader
{
    // Raised once the channel header has been parsed.
    public delegate void rssheaderreceiveeventhandler(simplerssreader sender, rssheader header);
    public event rssheaderreceiveeventhandler rssheaderreceive;

    // Raised each time one rssitem has been parsed.
    public delegate void rssitemreceiveeventhandler(simplerssreader sender, rssitem item);
    public event rssitemreceiveeventhandler rssitemreceive;

    // Flags for the reflective field lookup. ignorecase is required because RSS
    // element names are camelCase (pubDate, lastBuildDate, managingEditor)
    // while the backing fields are all lower case.
    private const bindingflags _fieldflags =
        bindingflags.nonpublic | bindingflags.public | bindingflags.instance | bindingflags.ignorecase;

    private type _trs; // typeof(rssheader)
    private type _tri; // typeof(rssitem)
    private arraylist _rssitemsal;
    private rssheader _rs;

    /// <summary>
    /// Header of the feed parsed by the last call to rss; null before that.
    /// </summary>
    public rssheader rssheader
    {
        get
        {
            return this._rs;
        }
    }

    // Holds all parsed items; never null so callers can enumerate it safely.
    private rssitem[] _rssitems = new rssitem[0];

    /// <summary>
    /// All items parsed by the last call to rss (empty when the feed has none).
    /// </summary>
    public rssitem[] rssitems
    {
        get
        {
            return this._rssitems;
        }
    }

    /// <summary>
    /// Loads the feed at <paramref name="url"/> and parses every /rss/channel
    /// node. For each channel rssheaderreceive is raised once, followed by
    /// rssitemreceive for each item in document order.
    /// </summary>
    /// <param name="url">address of the RSS 2.0 document</param>
    public void rss(string url)
    {
        xmldocument xd = new xmldocument();
        // If this turns out to be slow, webrequest can be used instead.
        xd.load(url);

        this._rs = new rssheader(url);
        this._trs = typeof(rssheader);
        this._tri = typeof(rssitem);
        this._rssitemsal = new arraylist();

        foreach (xmlnode xn in xd.selectnodes("/rss/channel"))
        {
            this.travel(xn);
        }

        // Always rebuild the array so an empty feed yields an empty array
        // rather than null or the items of a previous call.
        this._rssitems = (rssitem[]) this._rssitemsal.toarray(typeof(rssitem));
    }

    /// <summary>
    /// Parses one channel node: first the header fields, then every item.
    /// </summary>
    /// <param name="xn">the channel node</param>
    private void travel(xmlnode xn)
    {
        // The header is parsed up front and independently of the items, so it
        // is available (and its event fires) even when the feed has no items,
        // and the first item is handled like every other item.
        this.fill(this._trs, this._rs, xn);
        if (this.rssheaderreceive != null)
        {
            this.rssheaderreceive(this, this._rs);
        }

        foreach (xmlnode x in xn.childnodes)
        {
            if (x.name != "item")
            {
                continue;
            }

            rssitem ri = new rssitem();
            this.fill(this._tri, ri, x);

            // Attach the owning header; rssitem exposes no setter for it.
            fieldinfo fi = this._tri.getfield("_header", _fieldflags);
            if (fi != null)
            {
                fi.setvalue(ri, this._rs);
            }

            if (this.rssitemreceive != null)
            {
                this.rssitemreceive(this, ri);
            }
            this._rssitemsal.add(ri);
        }
    }

    /// <summary>
    /// Copies the inner text of each child element of <paramref name="parent"/>
    /// (nested item elements excepted) into the matching private string field
    /// of <paramref name="target"/>.
    /// </summary>
    /// <param name="t">type of <paramref name="target"/></param>
    /// <param name="target">rssheader or rssitem instance to populate</param>
    /// <param name="parent">channel or item node supplying the values</param>
    private void fill(type t, object target, xmlnode parent)
    {
        foreach (xmlnode n in parent.childnodes)
        {
            if (n.name == "item")
            {
                continue;
            }

            fieldinfo fi = t.getfield("_" + n.name.replace(":", ""), _fieldflags);
            // Only string fields are filled from XML; this keeps an element
            // that happens to be called "header" from hitting rssitem._header.
            if (fi != null && fi.fieldtype == typeof(string))
            {
                fi.setvalue(target, n.innertext);
            }
        }
    }
}
/// <summary>
/// Small helpers used by the rss classes and the test program.
/// </summary>
public class util
{
    /// <summary>
    /// Converts a date string to a datetime on a best-effort basis.
    /// </summary>
    /// <param name="s">date text; may be null or empty</param>
    /// <returns>
    /// The parsed value, or datetime.now when <paramref name="s"/> is null,
    /// empty or cannot be parsed.
    /// </returns>
    public static datetime parsedatetime(string s)
    {
        if (s == null || s.length == 0)
        {
            return datetime.now;
        }

        try
        {
            return datetime.parse(s);
        }
        catch
        {
            // Deliberate fallback: an unparsable feed date must not abort the read.
            return datetime.now;
        }
    }

    /// <summary>
    /// Removes HTML tags and comments from a string and decodes a few common
    /// character entities.
    /// </summary>
    /// <param name="html">source text</param>
    /// <returns>the text with markup removed</returns>
    public static string striphtml(string html) // found by googling "striphtml"
    {
        // regexs[i] is replaced by replaces[i]; the arrays must stay the same length.
        string[] regexs =
        {
            @"<script[^>]*?>.*?</script>",
            // Any tag; attribute values may be double- or single-quoted.
            @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
            @"([\r\n])[\s]+",
            @"&(quot|#34);",
            @"&(amp|#38);",
            @"&(lt|#60);",
            @"&(gt|#62);",
            @"&(nbsp|#160);",
            @"&(iexcl|#161);",
            @"&(cent|#162);",
            @"&(pound|#163);",
            @"&(copy|#169);",
            @"&#(\d+);",
            // End of an HTML comment becomes a line break so that the next
            // pattern, which stops at "\n", can remove the whole comment.
            @"-->",
            @"<!--.*\n"
        };
        string[] replaces =
        {
            "",
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            " ",
            "\xa1", //chr(161),
            "\xa2", //chr(162),
            "\xa3", //chr(163),
            "\xa9", //chr(169),
            "",
            "\r\n",
            ""
        };

        string s = html;
        for (int i = 0; i < regexs.length; i++)
        {
            s = new regex(regexs[i], regexoptions.multiline | regexoptions.ignorecase).replace(s, replaces[i]);
        }

        // Strings are immutable: the result of replace must be assigned back.
        s = s.replace("<", "");
        s = s.replace(">", "");
        s = s.replace("\r\n", "");
        return s;
    }
}
}
//测试程序
namespace test
{
using system;
using system.data;
using system.reflection;
using system.data.sqlclient;
using microshaoft;
using microshaoft.data;
/// <summary>
/// Console test driver: reads one feed with simplerssreader and stores the
/// header and every item through the stored procedures sp_addchannel and
/// sp_addchannelsdetails.
/// </summary>
class consoleapplication
{
    private sqlconnection _connection;
    public string _channel;

    /// <summary>
    /// Connection used for every stored procedure call.
    /// </summary>
    public sqlconnection connection
    {
        set
        {
            this._connection = value;
        }
        get
        {
            return this._connection;
        }
    }

    static void main()
    {
        // Other feeds that were used for testing:
        //   http://www.ccw.com.cn/rss/news2/1.xml
        //   http://dzh.mop.com/topic/rss.jsp?type=28
        //   http://www.ccw.com.cn/rss/news2/15.xml
        //   http://www.cnblogs.com/rss.aspx?id=-1
        //   http://weblog.siliconvalley.com/column/dangillmor/index.xml
        //   http://www.skyone.com.cn/sub/rss/list_jjsc.xml
        string s = "http://localhost/rss.xml";

        consoleapplication a = new consoleapplication();
        // NOTE(review): hard-coded "sa" login with an empty password; acceptable
        // only for a local test database.
        a.connection = new sqlconnection("server=server\\psqlke;user id=sa;password=;database=rss");
        a.connection.open();
        try
        {
            simplerssreader srr = new simplerssreader();
            srr.rssheaderreceive += new microshaoft.simplerssreader.rssheaderreceiveeventhandler(a.srr_rssheaderreceive);
            srr.rssitemreceive += new microshaoft.simplerssreader.rssitemreceiveeventhandler(a.srr_rssitemreceive);
            system.console.writeline("waiting ….");
            srr.rss(s); // to be made multi-threaded or asynchronous later
            system.console.writeline("print all rss header and items ….");
            system.console.readline();
            system.console.writeline("header: " + srr.rssheader.title);

            // rssitems may be null when the feed contained no items.
            rssitem[] items = srr.rssitems;
            if (items != null)
            {
                foreach (rssitem ri in items)
                {
                    system.console.writeline("item: " + ri.title);
                }
            }
            system.console.readline();
        }
        finally
        {
            // Release the connection even when reading or saving fails.
            a.connection.close();
        }
    }

    // Prints the parsed header and stores it.
    private void srr_rssheaderreceive(simplerssreader sender, rssheader header)
    {
        system.console.writeline("header:" + header.link);
        system.console.writeline("header:" + header.title);
        this.savetodatabase("sp_addchannel", typeof(rssheader), header);
    }

    // Prints the parsed item and stores it.
    private void srr_rssitemreceive(simplerssreader sender, rssitem item)
    {
        system.console.writeline("item: " + item.title);
        system.console.writeline("item: " + item.link);
        system.console.writeline("item: " + util.striphtml(item.description));
        this.savetodatabase("sp_addchannelsdetails", typeof(rssitem), item);
    }

    /// <summary>
    /// Calls stored procedure <paramref name="sp"/>, filling each input
    /// parameter from the public property of <paramref name="instance"/> that
    /// has the same name (case-insensitive, "@" ignored). String values are
    /// passed through util.striphtml first.
    /// </summary>
    /// <param name="sp">stored procedure name</param>
    /// <param name="t">type whose public properties supply the values</param>
    /// <param name="instance">object of type <paramref name="t"/> to store</param>
    private void savetodatabase(string sp, type t, object instance)
    {
        // Fetch all parameters of the stored procedure.
        sqlparameter[] spa = sqlhelperparametercache.getspparameterset(this.connection, sp);
        system.collections.hashtable ht = new system.collections.hashtable();
        for (int i = 0; i < spa.length; i++)
        {
            // Remember the position of each parameter by its name.
            ht.add(spa[i].parametername.tolower().replace("@", ""), i);
            // Default every parameter to SQL NULL. A plain null would make
            // ADO.NET treat the parameter as "not supplied" and the call fail.
            spa[i].value = system.dbnull.value;
        }

        foreach (propertyinfo x in t.getproperties())
        {
            string key = x.name.tolower();
            if (!ht.containskey(key))
            {
                continue;
            }

            // Position of the parameter that matches this property.
            int i = (int) ht[key];
            if (spa[i].direction != system.data.parameterdirection.input
                && spa[i].direction != system.data.parameterdirection.inputoutput)
            {
                continue;
            }

            object o = x.getvalue(instance, null);
            if (x.propertytype == typeof(string) && o != null)
            {
                o = util.striphtml((string) o);
            }
            spa[i].value = (o == null) ? (object) system.dbnull.value : o;
        }

        if (t == typeof(rssitem) && spa.length > 0)
        {
            // rssitem has no url property: the first parameter receives the
            // url of the owning channel.
            spa[0].value = ((rssitem) instance).header.url;
        }

        sqlhelper.executenonquery(this.connection, commandtype.storedprocedure, sp, spa);

        // The last parameter is read back as the id of the stored row.
        object id = (spa.length > 0) ? spa[spa.length - 1].value : null;
        if (id != null && id != system.dbnull.value)
        {
            system.console.writeline("save to id: {0} successful!", id);
        }
        else
        {
            system.console.writeline("save failed! may be duplicate!");
        }
    }
}
}
//==========================================================================================================
/*
--sql script
if exists (select * from dbo.sysobjects where id = object_id(n'[dbo].[sp_addchannel]') and objectproperty(id, n'isprocedure') = 1)
drop procedure [dbo].[sp_addchannel]
go
if exists (select * from dbo.sysobjects where id = object_id(n'[dbo].[sp_addchannelsdetails]') and objectproperty(id, n'isprocedure') = 1)
drop procedure [dbo].[sp_addchannelsdetails]
go
if exists (select * from dbo.sysobjects where id = object_id(n'[dbo].[channels]') and objectproperty(id, n'isusertable') = 1)
drop table [dbo].[channels]
go
if exists (select * from dbo.sysobjects where id = object_id(n'[dbo].[channelsdetails]') and objectproperty(id, n'isusertable') = 1)
drop table [dbo].[channelsdetails]
go
create table [dbo].[channels] (
[id] [int] identity (1, 1) not null ,
[url] [varchar] (1000) collate chinese_prc_ci_as null ,
[channel] [varchar] (100) collate chinese_prc_ci_as null ,
[title] [varchar] (100) collate chinese_prc_ci_as null ,
[description] [varchar] (1000) collate chinese_prc_ci_as null ,
[link] [varchar] (500) collate chinese_prc_ci_as null ,
[language] [varchar] (10) collate chinese_prc_ci_as null ,
[generator] [varchar] (100) collate chinese_prc_ci_as null ,
[ttl] [varchar] (100) collate chinese_prc_ci_as null ,
[copyright] [varchar] (100) collate chinese_prc_ci_as null ,
[pubdate] [datetime] null ,
[category] [varchar] (100) collate chinese_prc_ci_as null ,
[dclanguage] [varchar] (100) collate chinese_prc_ci_as null
) on [primary]
go
create table [dbo].[channelsdetails] (
[id] [int] identity (1, 1) not null ,
[channelid] [int] null ,
[title] [varchar] (8000) collate chinese_prc_ci_as null ,
[link] [varchar] (8000) collate chinese_prc_ci_as null ,
[description] [varchar] (8000) collate chinese_prc_ci_as null ,
[category] [varchar] (8000) collate chinese_prc_ci_as null ,
[author] [varchar] (8000) collate chinese_prc_ci_as null ,
[pubdate] [datetime] null ,
[comments] [varchar] (8000) collate chinese_prc_ci_as null ,
[guid] [varchar] (8000) collate chinese_prc_ci_as null ,
[trackbackping] [varchar] (8000) collate chinese_prc_ci_as null
) on [primary]
go
set quoted_identifier on
go
set ansi_nulls on
go
create proc sp_addchannel
@url varchar(8000)
,@link varchar(8000)
,@channel varchar(8000)
,@title varchar(8000)
,@image varchar(8000)
,@description varchar(7999)
,@language varchar(8000)
,@generator varchar(8000)
,@ttl varchar(8000)
,@copyright varchar(8000)
,@pubdate datetime
,@category varchar(8000)
,@docs varchar(8000)
,@managingeditor varchar(8000)
,@dclanguage varchar(8000)
,@ int out
as
set @ = 0
insert into channels ([url],[channel],[title],[description],[link],[language],[generator],[ttl],[copyright],[pubdate],[category],[dclanguage])
select @url,@channel,@title,@description,@link,@language,@generator,@ttl,@copyright,@pubdate,@category,@dclanguage
where not exists(select 1 from channels where [url] = @url)
select @ = scope_identity()
go
set quoted_identifier off
go
set ansi_nulls on
go
set quoted_identifier on
go
set ansi_nulls on
go
create proc sp_addchannelsdetails
@url varchar(8000)
,@title varchar(8000)
,@description varchar(7000)
,@link varchar(8000)
,@pubdate datetime
,@category varchar(8000)
,@comments varchar(8000)
,@guid varchar(8000)
,@trackbackping varchar(8000)
,@ int out
as
set @ = 0
insert into channelsdetails ([channelid],[title],[description],[link],[pubdate],[category],[comments],[guid],[trackbackping])
select id,@title,@description,@link,@pubdate,@category,@comments,isnull(@guid,@link),@trackbackping
from channels
where not exists (select 1 from channelsdetails where guid = isnull(@guid,@link)) and url = @url
select @ = scope_identity()
go
set quoted_identifier off
go
set ansi_nulls on
go
*/