您好,欢迎来到华拓科技网。
搜索
您的当前位置:首页 > 网络爬虫源代码

网络爬虫源代码

来源:华拓科技网


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A small multi-threaded web crawler.
 *
 * <p>The crawl starts from one entry-point URL. Worker threads repeatedly take a URL from a
 * shared queue, download the page, extract the absolute links it contains and put every link
 * that has not been seen before back on the queue. Each URL carries a level: entry points are
 * level {@value #SEED_LEVEL}, and a link found on a page of level {@code n} is recorded at level
 * {@code n + 1}. Pages whose level is greater than {@value #MAX_LEVEL} are recorded but never
 * downloaded.
 *
 * <p>The queue, the table of known URLs and the active-worker counter are only touched while
 * holding this object's monitor. Page downloads happen outside the monitor.
 */
public class Spider implements Runnable {

    private static final Logger logger = Logger.getLogger(Spider.class.getName());

    /** Number of worker threads used by the single-argument constructor. */
    private static final int DEFAULT_THREADS = 10;

    /** Level assigned to entry-point URLs. */
    private static final int SEED_LEVEL = 0;

    /** Pages recorded at a level greater than this are not downloaded. */
    private static final int MAX_LEVEL = 2;

    /** Connect and read timeout for page downloads, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Matches an {@code href} attribute whose value is an absolute http(s) URL (fragment excluded). */
    private static final Pattern HREF_PATTERN =
            Pattern.compile("href\\s*=\\s*[\"']?(https?://[^\"'\\s>#]+)", Pattern.CASE_INSENSITIVE);

    /** URLs waiting to be processed; guarded by {@code this}. */
    private final Deque<String> urls = new ArrayDeque<>();

    /** Every URL seen so far, mapped to its level; guarded by {@code this}. */
    private final Map<String, Integer> indexedURLs = new HashMap<>();

    /** Configured number of worker threads; never changes after construction. */
    private final int threads;

    /**
     * Number of workers that are not currently parked in {@link #dequeueURL()}; guarded by
     * {@code this}. Kept separate from {@link #threads} so that the start-up loop in
     * {@link #go()} does not read a value that workers are concurrently decrementing.
     */
    private int activeWorkers;

    /**
     * Command-line entry point: crawls the site whose URL is given as the first argument.
     *
     * @param argv command-line arguments; {@code argv[0]} is the start URL
     * @throws Exception if the calling thread is interrupted while waiting for the crawl
     */
    public static void main(String[] argv) throws Exception {
        // Check the length first: indexing an empty array throws instead of yielding null.
        if (argv.length == 0 || argv[0] == null) {
            System.out.println("Missing required argument: [Sit URL]");
            return;
        }
        Spider spider = new Spider(argv[0]);
        spider.go();
    }

    /**
     * Creates a crawler for the given start URL using {@value #DEFAULT_THREADS} worker threads.
     *
     * @param strURL the entry-point URL
     * @throws IllegalArgumentException if {@code strURL} is {@code null} or blank
     */
    public Spider(String strURL) {
        this(strURL, DEFAULT_THREADS);
    }

    /**
     * Creates a crawler for the given start URL with an explicit number of worker threads.
     *
     * @param strURL the entry-point URL
     * @param threads number of worker threads to start; must be at least 1
     * @throws IllegalArgumentException if {@code strURL} is {@code null} or blank, or if
     *     {@code threads} is less than 1
     */
    public Spider(String strURL, int threads) {
        if (strURL == null || strURL.trim().isEmpty()) {
            throw new IllegalArgumentException("Missing required argument: -u [start url]");
        }
        if (threads < 1) {
            throw new IllegalArgumentException("Invalid number of threads: " + threads);
        }
        this.threads = threads;
        urls.add(strURL);
        // Register the seed up front so a page linking back to it does not queue it again.
        indexedURLs.put(strURL, Integer.valueOf(SEED_LEVEL));
    }

    /**
     * Runs the crawl: starts the worker threads and blocks until all of them have finished.
     *
     * @throws Exception if the calling thread is interrupted while waiting for a worker
     */
    public void go() throws Exception {
        long start = System.currentTimeMillis();

        // Count every worker as active before any of them runs; a worker that finds the queue
        // empty must keep waiting while other workers (even unstarted ones) may still add URLs.
        synchronized (this) {
            activeWorkers = threads;
        }

        List<Thread> workers = new ArrayList<>(threads);
        for (int i = 0; i < threads; i++) {
            Thread worker = new Thread(this, "Spide " + (i + 1));
            workers.add(worker);
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }

        long elapsed = System.currentTimeMillis() - start;
        logger.info("Crawl finished in " + elapsed + " ms");
    }

    /**
     * Runs the crawl after adding {@code strURL} as an additional entry point.
     *
     * @param strURL an extra entry-point URL; ignored when {@code null}, blank or already known
     * @throws Exception if the calling thread is interrupted while waiting for a worker
     */
    public void go(String strURL) throws Exception {
        if (strURL != null && !strURL.trim().isEmpty()) {
            enqueueURL(strURL, SEED_LEVEL);
        }
        go();
    }

    /**
     * Worker loop: processes queued URLs until {@link #dequeueURL()} reports that the crawl is
     * complete or the thread is interrupted.
     */
    @Override
    public void run() {
        try {
            String url;
            while ((url = dequeueURL()) != null) {
                try {
                    indexURL(url);
                } catch (RuntimeException e) {
                    // Keep the worker alive: if it died here it would still be counted as
                    // active and the remaining workers would wait forever.
                    logger.log(Level.WARNING, "Failed to index " + url, e);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.log(Level.INFO, "Worker interrupted", e);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Worker stopped", e);
        }
    }

    /**
     * Returns the next URL to process, blocking while the queue is empty and other workers are
     * still active.
     *
     * @return the next queued URL, or {@code null} once the queue is empty and no active worker
     *     remains that could add to it
     * @throws Exception an {@link InterruptedException} if the thread is interrupted while waiting
     */
    public synchronized String dequeueURL() throws Exception {
        while (true) {
            if (!urls.isEmpty()) {
                return urls.removeFirst();
            }
            // Nothing to do right now: this worker stops counting as active.
            activeWorkers--;
            if (activeWorkers <= 0) {
                // Nobody is left to produce more work; wake the waiters so they can exit too.
                activeWorkers = 0;
                notifyAll();
                return null;
            }
            wait();
            // Woken up by enqueueURL or by the last worker finishing; re-check the queue.
            activeWorkers++;
        }
    }

    /**
     * Queues a URL at the given level unless it has been seen before, and wakes up waiting workers.
     *
     * @param url the URL to queue; {@code null} is ignored
     * @param level the level to record for the URL
     */
    public synchronized void enqueueURL(String url, int level) {
        if (url == null || indexedURLs.containsKey(url)) {
            return;
        }
        urls.addLast(url);
        indexedURLs.put(url, Integer.valueOf(level));
        notifyAll();
    }

    /**
     * Downloads the page at {@code url} and queues the links found on it.
     *
     * <p>Nothing is downloaded when the URL's recorded level exceeds {@value #MAX_LEVEL}. Download
     * and parse failures are logged and end processing of this URL only.
     *
     * @param url the page to process
     */
    private void indexURL(String url) {
        int childLevel;
        synchronized (this) {
            Integer recorded = indexedURLs.get(url);
            if (recorded == null) {
                // Not reachable through enqueueURL; treat an unknown URL as an entry point.
                recorded = Integer.valueOf(SEED_LEVEL);
                indexedURLs.put(url, recorded);
            }
            if (recorded.intValue() > MAX_LEVEL) {
                return;
            }
            childLevel = recorded.intValue() + 1;
        }

        // The download runs outside the monitor so other workers are not blocked on network I/O.
        String body;
        try {
            body = loadURL(url);
        } catch (IOException | RuntimeException e) {
            logger.log(Level.FINE, "Could not load " + url, e);
            return;
        }
        if (body == null) {
            return;
        }

        String[] links;
        try {
            links = parseURLs(body);
        } catch (RuntimeException e) {
            logger.log(Level.INFO, "Could not parse links of " + url, e);
            return;
        }
        if (links == null) {
            return;
        }
        for (String link : links) {
            enqueueURL(link, childLevel);
        }
    }

    /**
     * Downloads the content at {@code url} as text.
     *
     * <p>The bytes are decoded as UTF-8 regardless of what the server declares.
     *
     * @param url an absolute URL
     * @return the page content
     * @throws IOException if the URL is malformed or the download fails
     * @throws IllegalArgumentException if {@code url} is not a valid absolute URI
     */
    protected String loadURL(String url) throws IOException {
        URLConnection connection = URI.create(url).toURL().openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream in = connection.getInputStream();
                Reader reader =
                        new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[4096];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
            return content.toString();
        }
    }

    /**
     * Extracts the absolute {@code http}/{@code https} link targets from {@code href} attributes
     * in a page.
     *
     * <p>Relative links are not returned because no base URL is available here. Fragments
     * ({@code #...}) are dropped and duplicates are removed; first-occurrence order is kept.
     *
     * @param body the page content
     * @return the distinct absolute URLs found; empty when there are none
     */
    protected String[] parseURLs(String body) {
        Set<String> found = new LinkedHashSet<>();
        Matcher matcher = HREF_PATTERN.matcher(body);
        while (matcher.find()) {
            found.add(matcher.group(1));
        }
        return found.toArray(new String[0]);
    }
}

因篇幅问题不能全部显示,请点此查看更多更全内容

Copyright © 2019- huatuo6.cn 版权所有 赣ICP备2024042791号-9

违法及侵权请联系:TEL:199 18 7713 E-MAIL:2724546146@qq.com

本站由北京市万商天勤律师事务所王兴未律师提供法律服务