用 HttpUnit 写的 spider 程序:可以监测网站的错误页面!

类别:Java 点击:0 评论:0 推荐:

这个程序出自《Java Tools for Extreme Programming》一书。

import com.meterware.httpunit.*;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;

public class CheckSite {

 private WebConversation conversation;

 private Set checkedLinks;

 private String host = "www.sohu.com";

 public static void main(String[] args) throws Exception {
  CheckSite cs = new CheckSite();
  cs.setUp();
  cs.testEntireSite();
 }

 public void setUp() {
  conversation = new WebConversation();
  checkedLinks = new HashSet();
 }

 public void testEntireSite() throws Exception {
  WebResponse response = conversation.getResponse("http://" + host);
  checkAllLinks(response);
  System.out.println("Site check finished. Link's checked: "
    + checkedLinks.size() + " : " + checkedLinks);
 }

 private void checkAllLinks(WebResponse response) throws Exception {
  if (!isHtml(response)) {
   return;
  }
  WebLink[] links = response.getLinks();
  System.out.println(response.getTitle() + " -- links found = "
    + links.length);
  for (int i = 0; i < links.length; i++) {
   boolean newLink = checkedLinks.add(links[i].getURLString());
   if (newLink) {
    System.out.println("Total links checked so far: "
      + checkedLinks.size());
    checkLink(links[i]);
   }
  }
 }

 private boolean isHtml(WebResponse response) {
  return response.getContentType().equals("text/html");
 }

 private void checkLink(WebLink link) throws Exception {
  WebRequest request = link.getRequest();
  java.net.URL url = request.getURL();
  System.out.println("checking link: " + url);
  String linkHost = url.getHost();
  if (linkHost.equals(this.host)) {
   WebResponse response = conversation.getResponse(request);
   this.checkAllLinks(response);
  }

 }
}

本文地址:http://com.8s8s.com/it/it12716.htm