国庆闲暇时间,写了一个捕获csdn文章的工具。采用了一些简单的算法,希望csdn 不要见怪。
本来想实现图片自动上传,但是没有空,连文章的doc说明也没有仔细写。:)
开发工具:Eclipse 3.0
工作平台:Windows XP
/************************************************
* <p>csdn文章采集工具</p>
* <p>csdn文章采集工具</p>
* <p>CreateDate: 2004-10-3 19:59:54</p>
* <p>Description:</p>
* <p>Copyright: Copyright (c) 2004</p>
* <p>Company: 秋水工作室</p>
* @author 王凯
* @version 1.0
***********************************************/
import java.net.*;
import java.sql.*;
import java.io.*;
/**
 * Scraper for CSDN articles: downloads the article list, extracts each
 * article's title, keywords and content from well-known {@code <span>}
 * elements and writes one HTML page per article to the local disk.
 * {@link #addnew} shows how an article could be stored in a database instead.
 */
public class OpenUrl
{
    /** Characters that cannot appear in a Windows file name. */
    private static final String ILLEGAL_FILE_NAME_CHARS = "\\/:*?\"<>|";

    /**
     * Downloads the text behind a URL.
     *
     * <p>Lines are read with the platform default charset and re-joined
     * with {@code "\r\n"} (a trailing {@code "\r\n"} follows the last line).
     *
     * @param strUrl the address to read
     * @return the downloaded text, or {@code "error open url" + strUrl} if the
     *         URL is malformed or cannot be read
     */
    public String getContent(String strUrl)
    {
        BufferedReader br = null;
        try {
            URL url = new URL(strUrl);
            br = new BufferedReader(new InputStreamReader(url.openStream()));
            StringBuffer sb = new StringBuffer();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
            return sb.toString();
        } catch (Exception e) {
            // Callers rely on this sentinel string instead of an exception.
            return "error open url" + strUrl;
        } finally {
            // Release the connection even when reading failed half-way.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Downloads one article, extracts title, keywords and content, and saves
     * the result as {@code c:\csnd\<addname>\<title>.htm}.
     *
     * <p>Each field is taken from its marker {@code <span>} up to the first
     * following {@code </span>}; a field whose markers are missing (for
     * example because the download failed) is left empty.
     *
     * @param Path    URL of the article
     * @param addname sub-directory (below {@code c:\csnd\}) to store the page in
     * @param names   currently unused; kept for interface compatibility
     * @return the HTML of the generated page
     */
    public static String GetNews(String Path, String addname, String names) {
        OpenUrl ou = new OpenUrl();
        String htmlbody = ou.getContent(Path);
        String title = GetSkip(htmlbody,
                "<span id=\"ArticleTitle1_ArticleTitle1_lblTitle\">", "</span>");
        String aboutkey = GetSkip(htmlbody,
                "<span id=\"ArticleTitle1_ArticleTitle1_lblKeywords\">", "</span>");
        String content = GetSkip(htmlbody,
                "<span id=\"ArticleContent1_ArticleContent1_lblContent\">", "</span>");
        System.out.println("title=" + title);

        // The title belongs in <head>; the document is closed exactly once,
        // after the article body.
        String body = "<html><head><title>" + title + "</title></head>"
                + "<body><csdntitle>标题:" + title + "</csdntitle><br>"
                + "<csdnaboutkey>" + aboutkey + "</csdnaboutkey>"
                + "<csdnbody>" + content + "</csdnbody>" + "</body></html>";

        // The title comes from the remote page, so it must be made safe
        // before it is used as a file name.
        OpenUrl.scwj("c:\\csnd\\" + addname, toFileName(title) + ".htm", body);
        return body;
    }

    /** Replaces control characters and characters illegal in file names with '_'. */
    private static String toFileName(String title) {
        StringBuffer sb = new StringBuffer(title.length());
        for (int i = 0; i < title.length(); i++) {
            char c = title.charAt(i);
            boolean illegal = (c < ' ') || (ILLEGAL_FILE_NAME_CHARS.indexOf(c) >= 0);
            sb.append(illegal ? '_' : c);
        }
        return sb.toString();
    }

    /**
     * Strips {@code <P ...>} and {@code <SPAN ...>} opening tags (upper case,
     * with attributes) together with {@code </P>} and {@code </SPAN>}
     * closing tags from a fragment of HTML. Currently not called by
     * {@link #GetNews}.
     *
     * @param body the HTML fragment to clean
     * @return the fragment without those tags; an opening tag that is never
     *         terminated by {@code >} is left untouched
     */
    public static String skipp(String body) {
        System.out.println("skipi=" + body.indexOf("<P "));
        body = stripOpeningTags(body, "<P ");
        body = stripOpeningTags(body, "<SPAN ");
        // Strings are immutable: the result has to be assigned back.
        body = body.replaceAll("</SPAN>", "");
        body = body.replaceAll("</P>", "");
        return body;
    }

    /** Removes every span running from {@code tagStart} to the next {@code '>'}. */
    private static String stripOpeningTags(String body, String tagStart) {
        int start;
        while ((start = body.indexOf(tagStart)) >= 0) {
            int end = body.indexOf('>', start);
            if (end < 0) {
                // Unterminated tag: nothing can be removed, and retrying
                // would never make progress.
                break;
            }
            body = body.substring(0, start) + body.substring(end + 1);
        }
        return body;
    }

    /**
     * Returns the text between the first occurrence of {@code spath} and the
     * first occurrence of {@code ePath} that follows it.
     *
     * @param body  the text to search
     * @param spath start marker (not included in the result)
     * @param ePath end marker (not included in the result)
     * @return the enclosed text, or {@code ""} if either marker is missing
     */
    public static String GetSkip(String body, String spath, String ePath) {
        int start = body.indexOf(spath);
        if (start < 0) {
            return "";
        }
        start += spath.length();
        int end = body.indexOf(ePath, start);
        if (end < 0) {
            return "";
        }
        return body.substring(start, end);
    }

    /**
     * Usage example: walks the article list page and saves every linked
     * article via {@link #GetNews}.
     */
    public static void test2() {
        OpenUrl ou = new OpenUrl();
        String htmlbody = ou.getContent("http://dev.csdn.net/articlelist.aspx?c=6");
        final String marker = "article/";
        int at;
        while ((at = htmlbody.indexOf(marker)) >= 0) {
            // Continue scanning behind the marker that was just found.
            htmlbody = htmlbody.substring(at + marker.length());
            int end = htmlbody.indexOf("\" target=");
            if (end < 0) {
                // No terminated link remains in the rest of the page.
                break;
            }
            String names = htmlbody.substring(0, end);
            String path = "http://dev.csdn.net/article/" + names;
            System.out.println(path);
            int i = names.indexOf("/");
            String addname = "";
            if (i >= 0) {
                // NOTE(review): only the first character is used as the
                // directory name; substring(0, i) may have been intended.
                addname = names.substring(0, 1);
                names = names.substring(i + 1);
            }
            OpenUrl.GetNews(path, addname, names);
        }
    }

    /**
     * Reference implementation for storing an article in the {@code develop}
     * table of the ODBC data source {@code csdn}.
     *
     * <p>Database errors are printed to standard output, not thrown.
     *
     * @param title    article title
     * @param aboutkey article keywords
     * @param pathurl  URL the article was loaded from
     * @param body     article content
     * @return {@code true} if a row was inserted, {@code false} otherwise
     */
    public static boolean addnew(String title, String aboutkey, String pathurl, String body) {
        try {
            Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
        } catch (ClassNotFoundException e) {
            System.out.print("驱动程序不存在");
        }

        boolean addok = false;
        Connection odbcconn = null;
        PreparedStatement odbcstmt = null;
        try {
            odbcconn = DriverManager.getConnection("jdbc:odbc:csdn");
            // Bound parameters: scraped text may contain quotes or SQL.
            odbcstmt = odbcconn.prepareStatement(
                    "insert into develop (title,aboutkey,pathurl,body) values (?,?,?,?)");
            odbcstmt.setString(1, title);
            odbcstmt.setString(2, aboutkey);
            odbcstmt.setString(3, pathurl);
            odbcstmt.setString(4, body);
            // execute() reports "is there a ResultSet", which is always false
            // for an INSERT; the update count is the real success signal.
            addok = odbcstmt.executeUpdate() > 0;
        } catch (SQLException e) {
            System.out.print(e);
        } finally {
            if (odbcstmt != null) {
                try {
                    odbcstmt.close();
                } catch (SQLException ignored) {
                    // Best effort: the statement is being discarded anyway.
                }
            }
            if (odbcconn != null) {
                try {
                    odbcconn.close();
                } catch (SQLException ignored) {
                    // Best effort: the connection is being discarded anyway.
                }
            }
        }
        return addok;
    }

    /**
     * Writes {@code body} followed by a newline to the file {@code FileName}
     * inside the directory {@code path}, creating the directory if needed.
     * The platform default charset is used.
     *
     * <p>Failures are reported with a stack trace, not thrown.
     *
     * @param path     target directory
     * @param FileName name of the file to create inside {@code path}
     * @param body     text to write
     * @return {@code true} if the file was written without error
     */
    public static boolean scwj(String path, String FileName, String body) {
        PrintWriter out = null;
        try {
            File dir = new File(path);
            dir.mkdirs();
            // File(parent, child) inserts the separator of the current platform.
            out = new PrintWriter(new FileWriter(new File(dir, FileName)));
            out.print(body + "\n");
            // PrintWriter swallows IOExceptions; checkError() flushes and
            // reveals whether one occurred.
            return !out.checkError();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (RuntimeException e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                out.close();
            }
        }
        return false;
    }

    /** Entry point: runs the article list crawl. */
    public static void main(String args[])
    {
        OpenUrl.test2();
    }
}
完毕
本文地址:http://com.8s8s.com/it/it14920.htm