好文档 - 专业文书写作范文服务资料分享网站

网络爬虫Java实现原理

天下 分享 时间: 加入收藏 我要投稿 点赞

} /**

* Called to start the spider */

public void begin() {

cancel = false;

while ( !getWorkloadWaiting().isEmpty() && !cancel ) { Object list[] = getWorkloadWaiting().toArray(); for ( int i=0;(i

/**

* A HTML parser callback used by this class to detect links *

* @author wuhailin * @version 1.0 */

protected class Parser

extends HTMLEditorKit.ParserCallback { protected URL base;

public Parser(URL base) {

this.base = base; }

public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos)

{

String href = (String)a.getAttribute(HTML.Attribute.HREF);

if( (href==null) && (t==HTML.Tag.FRAME) ) href = (String)a.getAttribute(HTML.Attribute.SRC);

if ( href==null ) return;

int i = href.indexOf('#'); if ( i!=-1 )

href = href.substring(0,i);

if ( href.toLowerCase().startsWith(\) { report.spiderFoundEMail(href); return; }

handleLink(base,href); }

public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos) {

handleSimpleTag(t,a,pos); // handle the same way

}

protected void handleLink(URL base,String str) { try {

URL url = new URL(base,str);

if ( report.spiderFoundURL(base,url) ) addURL(url);

} catch ( MalformedURLException e ) { log(\malformed URL: \+ str ); } } }

/**
 * Called internally to log information. This basic method just writes the entry to
 * standard output, preceded by the current date.
 *
 * @param entry The information to be written to the log.
 */
public void log(String entry) {
  // NOTE(review): the separator literal between the date and the entry was lost when
  // this listing was extracted; ":" is a reconstruction - confirm against the original.
  System.out.println((new Date()) + ":" + entry);
}
} // closes the enclosing class (its header is outside this excerpt)

4. HTMLParse.java

import javax.swing.text.html.*;

/**
 * An {@link HTMLEditorKit} subclass that re-declares {@code getParser()} as public so the
 * kit's parser can be obtained by code outside the kit.
 */
public class HTMLParse extends HTMLEditorKit {

  /**
   * Returns the parser provided by the superclass implementation.
   *
   * @return the result of {@code HTMLEditorKit.getParser()}
   */
  @Override
  public HTMLEditorKit.Parser getParser() {
    final HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}

网络爬虫Java实现原理

}/***Calledtostartthespider*/publicvoidbegin(){cancel=false;while(!getWorkloadWaiting().isEmpty()&&!cancel){Objectlis
推荐度:
点击下载文档文档为doc格式
0463u8hrgx9da6a52izb
领取福利

微信扫码领取福利

微信扫码分享