使用java.net的方法得到网站页面内的文章 并生成文件的方法

类别:Java 点击:0 评论:0 推荐:

国庆闲暇时间,写了一个捕获csdn文章的工具。采用了一些简单的算法,希望csdn 不要见怪。
本来想实现图片自动上传,但是没有空,连文章的doc说明也没有仔细写。:)
开发工具:Eclipse3.0
工作平台:Windows XP

/************************************************
 * <p>csdn文章采集工具</p>
 * <p>csdn文章采集工具</p>
 * <p>CreateDate: 2004-10-3  19:59:54</p>
 * <p>Description:</p>
 * <p>Copyright: Copyright (c) 2004</p>
 * <p>Company: 秋水工作室</p>
 * @author 王凯
 * @version 1.0
 ***********************************************/
import java.net.*;
import java.sql.*;
import java.io.*;

/**
 * Small scraping tool: downloads an article-list page, follows every
 * "article/..." link found in it, extracts title / keywords / content from
 * each article page by plain substring search, and stores the result as a
 * local HTML file. An optional JDBC helper ({@link #addnew}) inserts the same
 * data into a database table instead.
 *
 * All methods are best-effort: failures are reported through return values or
 * console output rather than exceptions.
 */
public class OpenUrl
{
    /**
     * Downloads the text behind a URL.
     *
     * Lines are read with the platform default charset and re-joined with
     * "\r\n".
     *
     * @param strUrl address of the page to fetch
     * @return the page text, or the string "error open url" followed by
     *         strUrl when the URL cannot be opened or read
     */
    public String getContent(String strUrl)
    {
        BufferedReader reader = null;
        try {
            URL url = new URL(strUrl);
            reader = new BufferedReader(new InputStreamReader(url.openStream()));
            StringBuffer page = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append("\r\n");
            }
            return page.toString();
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting the error string back
            // instead of an exception, whatever went wrong.
            return "error open url" + strUrl;
        } finally {
            // Close on every path so a failed read does not leak the connection.
            closeQuietly(reader);
        }
    }

    /**
     * Fetches one article page, extracts its title, keywords and content, and
     * writes them as an HTML file under c:\csnd\&lt;addname&gt;.
     *
     * @param Path    URL of the article page
     * @param addname sub-directory (below c:\csnd\) the file is written to
     * @param names   article name taken from the link; used as the file name
     *                when the page has no usable title
     * @return the generated HTML document
     */
    public static String GetNews(String Path, String addname, String names) {
        OpenUrl ou = new OpenUrl();
        String htmlbody = ou.getContent(Path);

        String title = GetSkip(htmlbody,
                "<span id=\"ArticleTitle1_ArticleTitle1_lblTitle\">", "</span>");
        String aboutkey = GetSkip(htmlbody,
                "<span id=\"ArticleTitle1_ArticleTitle1_lblKeywords\">", "</span>");
        // NOTE(review): extraction stops at the first "</span>", so content that
        // itself contains a span element is cut short there.
        String content = GetSkip(htmlbody,
                "<span id=\"ArticleContent1_ArticleContent1_lblContent\">", "</span>");

        System.out.println("title=" + title);

        // Well-formed skeleton: the title lives in <head>, and there is exactly
        // one <html>/<body> pair, properly closed.
        String body = "<html><head><title>" + title + "</title></head>"
                + "<body><csdntitle>标题:" + title + "</csdntitle><br>"
                + "<csdnaboutkey>" + aboutkey + "</csdnaboutkey>"
                + "<csdnbody>" + content + "</csdnbody>" + "</body></html>";

        OpenUrl.scwj("c:\\csnd\\" + addname, toFileName(title, names) + ".htm", body);
        return body;
    }

    /**
     * Builds a file name from an article title.
     *
     * Characters that are not allowed in Windows file names, and control
     * characters, are replaced with '_'. When the title is null or blank the
     * fallback is used instead (sanitized the same way).
     */
    private static String toFileName(String title, String fallback) {
        String raw = (title == null) ? "" : title.trim();
        if (raw.length() == 0 && fallback != null) {
            raw = fallback.trim();
        }
        StringBuffer safe = new StringBuffer(raw.length());
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            boolean illegal = c < ' ' || "\\/:*?\"<>|".indexOf(c) >= 0;
            safe.append(illegal ? '_' : c);
        }
        return safe.toString();
    }

    /**
     * Strips paragraph and span markup from a fragment: every opening tag
     * starting with "&lt;P " or "&lt;SPAN " is removed (up to and including its
     * closing '&gt;'), then all "&lt;/SPAN&gt;" and "&lt;/P&gt;" closing tags are removed.
     * Matching is case-sensitive. No longer called by the scraper itself.
     *
     * @param body HTML fragment to clean
     * @return the fragment without those tags
     */
    public static String skipp(String body) {
        System.out.println("skipi=" + body.indexOf("<P "));
        body = stripOpeningTags(body, "<P ");
        body = stripOpeningTags(body, "<SPAN ");
        // String is immutable: the result of replaceAll must be assigned back.
        body = body.replaceAll("</SPAN>", "");
        body = body.replaceAll("</P>", "");
        return body;
    }

    /**
     * Removes every occurrence of an opening tag that begins with tagPrefix,
     * from the prefix through the next '&gt;'. An occurrence with no closing
     * '&gt;' ends the scan and is left in place, which guarantees termination.
     */
    private static String stripOpeningTags(String body, String tagPrefix) {
        int start = body.indexOf(tagPrefix);
        while (start >= 0) {
            int end = body.indexOf(">", start);
            if (end < 0) {
                break;
            }
            body = body.substring(0, start) + body.substring(end + 1);
            start = body.indexOf(tagPrefix);
        }
        return body;
    }

    /**
     * Returns the text between the first occurrence of spath and the next
     * occurrence of ePath after it.
     *
     * @param body  text to search
     * @param spath start marker (not included in the result)
     * @param ePath end marker (not included in the result)
     * @return the enclosed text, or "" when either marker is missing or any
     *         argument is null
     */
    public static String GetSkip(String body, String spath, String ePath) {
        if (body == null || spath == null || ePath == null) {
            return "";
        }
        int start = body.indexOf(spath);
        if (start < 0) {
            return "";
        }
        int from = start + spath.length();
        int end = body.indexOf(ePath, from);
        if (end < 0) {
            return "";
        }
        return body.substring(from, end);
    }

    /**
     * Example driver: downloads the article list and saves every linked
     * article via {@link #GetNews}.
     */
    public static void test2() {
        OpenUrl ou = new OpenUrl();
        String htmlbody = ou.getContent("http://dev.csdn.net/articlelist.aspx?c=6");

        final String marker = "article/";
        int at = htmlbody.indexOf(marker);
        while (at >= 0) {
            // Drop everything up to and including the marker; the link target
            // then starts at index 0.
            htmlbody = htmlbody.substring(at + marker.length());

            int end = htmlbody.indexOf("\" target=");
            if (end < 0) {
                // No terminator left: nothing more can be extracted.
                break;
            }
            String names = htmlbody.substring(0, end);
            String path = "http://dev.csdn.net/article/" + names;
            System.out.println(path);

            int i = names.indexOf("/");
            String addname = "";
            if (i >= 0) {
                // NOTE(review): only the first character is used as the
                // sub-directory, not the whole segment before '/'. Confirm
                // whether substring(0, i) was intended.
                addname = names.substring(0, 1);
                names = names.substring(i + 1);
            }
            OpenUrl.GetNews(path, addname, names);

            at = htmlbody.indexOf(marker);
        }
    }

    /**
     * Reference implementation for storing an article through the JDBC-ODBC
     * data source "csdn" (table "develop").
     *
     * Values are bound as statement parameters, so they need no quoting or
     * escaping by the caller.
     *
     * @param title    article title
     * @param aboutkey article keywords
     * @param pathurl  URL the article was fetched from
     * @param body     article content
     * @return true when a row was inserted, false otherwise
     */
    public static boolean addnew(String title, String aboutkey, String pathurl, String body) {
        try {
            Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
        } catch (ClassNotFoundException e) {
            System.out.print("驱动程序不存在");
        }

        Connection conn = null;
        PreparedStatement stmt = null;
        try {
            conn = DriverManager.getConnection("jdbc:odbc:csdn");
            stmt = conn.prepareStatement(
                    "insert into develop (title,aboutkey,pathurl,body) values (?,?,?,?)");
            stmt.setString(1, title);
            stmt.setString(2, aboutkey);
            stmt.setString(3, pathurl);
            stmt.setString(4, body);
            // executeUpdate reports the affected row count; execute() would
            // return false for an INSERT even when it succeeds.
            return stmt.executeUpdate() > 0;
        } catch (SQLException e) {
            System.out.print(e);
            return false;
        } finally {
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Writes body (followed by a newline) to the file FileName inside the
     * directory path, creating the directory if necessary.
     *
     * @param path     target directory
     * @param FileName name of the file to create or overwrite
     * @param body     text to write
     * @return true when the file was written without error, false otherwise
     */
    public static boolean scwj(String path, String FileName, String body) {
        PrintWriter out = null;
        try {
            File dir = new File(path);
            dir.mkdirs();
            // File(parent, child) inserts the platform's own separator.
            File target = new File(dir, FileName);
            out = new PrintWriter(new FileWriter(target));
            out.print(body + "\n");
            // PrintWriter swallows IOExceptions; checkError() flushes and
            // reports whether any write failed.
            return !out.checkError();
        } catch (Exception e) {
            // Broad on purpose: this method reports failure via its return
            // value and never throws.
            e.printStackTrace();
            return false;
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /** Closes a reader, ignoring a null argument and any close failure. */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // The data has already been read (or the read already failed).
        }
    }

    public static void main(String args[])
    {
        OpenUrl.test2();
    }

}
完毕

本文地址:http://com.8s8s.com/it/it14920.htm