可以在此基础上结合正则表达式做成更好的效果,希望大家能分享一下xmlhttp的session共享技术
<html>
<head>
<title>autoget</title>
<meta http-equiv="content-type" content="text/html; charset=gb2312">
</head>
<body bgcolor="#ffffff" style="font-family:arial;font-size:12px">
<%
=================================================
filename: getit.asp
intro : auto get data from remote website
author: babyt(阿泰)
url: http://blog.csdn.net/babyt
createat: 2002-02 lastupdate:2004-09
db table : data
table field:
uid -> long -> keep id of the pages
ucontent -> text -> keep content of the pages(html)
=================================================
server.scripttimeout=5000
on error resume next
set conn = server.createobject("adodb.connection")
conn.open "provider=microsoft.jet.oledb.4.0;data source=" & server.mappath("getit.mdb")
set rs = server.createobject("adodb.recordset")
sql="select * from data"
rs.open sql,conn,1,3
dim comefrom,myerr,mycount
========================================================
comefrom="http://www.xxx.com/u.asp?id="
myerr1="该资料不存在"
myerr2="该资料已隐藏"
========================================================
***************************************************************
只需要更改这里 i 的始点intmin和终点intmax,设定步长intstep
每次区间设置成5万左右。估计要两个多小时。期间不需要人工干预
****************************************************************
intmin=0
intmax=10000
设定步长
intstep=100
==========================================================
以下代码不要更改
==========================================================
call getpart (intmin)
response.write "已经转换完成" & intmin & "~~" & intmax & "之间的数据"
rs.close
set rs=nothing
conn.close
set conn=nothing
%>
</body>
</html>
<%
使用xmlhttp抓取地址并进次内容处理
function getbody(url)
dim objxml
on error resume next
set objxml = createobject("microsoft.xmlhttp")
with objxml
.open "get", url, false, "", ""
.send
getbody = .responsebody
end with
getbody=bytestobstr(getbody,"gb2312")
set objxml = nothing
end function
使用adodb.stream处理二进制数据
function bytestobstr(strbody,codebase)
dim objstream
set objstream = server.createobject("adodb.stream")
objstream.type = 1
objstream.mode =3
objstream.open
objstream.write strbody
objstream.position = 0
objstream.type = 2
objstream.charset = codebase
bytestobstr = objstream.readtext
objstream.close
set objstream = nothing
end function
主函数
function getpart(istart)
dim igo
time1=timer()
mycount=0
for igo=istart to istart+intstep
if igo<=intmax then
response.execute comefrom & igo
进行简单的数据处理
content = getbody(comefrom & igo )
content = replace(content,chr(34),"”")
if instr(content,myerr1) or instr(content,myerr2) then
跳过错误信息
else
写入数据库
rs.addnew
rs("uid")=igo
********************************
rs("ucontent")=replace(content,"”",chr(34))
*********************************
rs.update
mycount=mycount+1
response.write igo & "<br>"
response.flush
end if
else
response.write "<font color=red>成功抓取"&mycount&"条记录,"
time2=timer()
response.write "耗时:" & int(formatnumber((time2-time1)*1000000,3)) & " 秒</font><br>"
response.flush
exit function
end if
next
response.write "<font color=red>成功抓取"&mycount&"条记录,"
time2=timer()
response.write "耗时:" & cint(formatnumber((time2-time1),3)) & " 秒</font><br>"
response.flush
递归
getpart(igo+1)
end function%>