[原创代码]XMLHTTP批量抓取远程资料

类别:Asp 点击:0 评论:0 推荐:

可以在此基础上结合正则表达式做成更好的效果,希望大家能分享一下Xmlhttp的session共享技术

<html>
<head>
<title>AUTOGET</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
</head>
<body bgcolor="#FFFFFF" style="font-family:Arial;font-size:12px">
<%
'=================================================
'FileName: Getit.Asp
'Intro : Auto Get Data From Remote WebSite
'Author: Babyt(阿泰)
'URL: http://blog.csdn.net/babyt
'CreateAt: 2002-02  LastUpdate:2004-09
'DB Table : data
'Table Field:
' UID -> Long -> Keep ID Of the pages
' UContent -> Text -> Keep Content Of the Pages(HTML)
'=================================================

Server.ScriptTimeout=5000

'on error resume next
Set conn = Server.CreateObject("ADODB.Connection")
conn.open "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" & Server.MapPath("getit.mdb")
Set rs = Server.CreateObject("ADODB.Recordset")
sql="select * from data"
rs.open sql,conn,1,3

Dim comeFrom,myErr,myCount

'========================================================
comeFrom="http://www.xxx.com/U.asp?ID="
myErr1="该资料不存在"
myErr2="该资料已隐藏"
'========================================================

'***************************************************************
' 只需要更改这里 i 的始点intMin和终点intMax,设定步长intStep
' 每次区间设置成5万左右。估计要两个多小时。期间不需要人工干预
'****************************************************************
intMin=0
intMax=10000
'设定步长
intStep=100

'==========================================================
'以下代码不要更改
'==========================================================
Call GetPart (intMin)
Response.write "已经转换完成" & intMin & "~~" & intMax & "之间的数据"
rs.close
Set rs=Nothing
conn.Close
set conn=nothing
%>
</body>
</html>
<%
'使用XMLHTTP抓取地址并进次内容处理
Function GetBody(Url)
        Dim objXML
        On Error Resume Next
        Set objXML = CreateObject("Microsoft.XMLHTTP")
        With objXML
        .Open "Get", Url, False, "", ""
        .Send
        GetBody = .ResponseBody
        End With
        GetBody=BytesToBstr(GetBody,"GB2312")
        Set objXML = Nothing
End Function
'使用Adodb.Stream处理二进制数据
Function BytesToBstr(strBody,CodeBase)
        dim objStream
        set objStream = Server.CreateObject("Adodb.Stream")
        objStream.Type = 1
        objStream.Mode =3
        objStream.Open
        objStream.Write strBody
        objStream.Position = 0
        objStream.Type = 2
        objStream.Charset = CodeBase
        BytesToBstr = objStream.ReadText
        objStream.Close
        set objStream = nothing
End Function
'主函数
Function GetPart(iStart)
 Dim iGo
 time1=timer()
 myCount=0
 For iGo=iStart To iStart+intStep
  If iGo<=intMax Then
   Response.Execute comeFrom & iGo
   '进行简单的数据处理
   content = GetBody(comeFrom & iGo ) 
   content = Replace(content,chr(34),"”")
   If  instr(content,myErr1) OR instr(content,myErr2)  Then
    '跳过错误信息
   Else  
    '写入数据库
    rs.AddNew
    rs("UID")=iGo
    '********************************  
    rs("UContent")=Replace(content,"”",chr(34))
    '*********************************
    rs.update
    myCount=myCount+1
    Response.Write iGo & "<BR>"
    Response.Flush
   End If   
  Else
   Response.write "<font color=red>成功抓取"&myCount&"条记录,"
   time2=timer()
   Response.write "耗时:" & Int(FormatNumber((time2-time1)*1000000,3)) & " 秒</font><BR>"
   Response.Flush
   Exit Function
  End If
 Next
 Response.write "<font color=red>成功抓取"&myCount&"条记录,"
 time2=timer()
 Response.write "耗时:" & CInt(FormatNumber((time2-time1),3)) & " 秒</font><BR>"
 Response.Flush
 '递归
 GetPart(iGo+1)
End Function%>

本文地址:http://com.8s8s.com/it/it8704.htm