WebCollector 爬虫采集 Java 代码模板(附带源码地址)
package work;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.springframework.dao.DuplicateKeyException;
import org.springframework.jdbc.core.JdbcTemplate;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import db.JDBCHelper;
import okhttp3.Request;
import util.HtmlTools;
/**
 * Breadth-first crawler template that collects news articles starting from
 * {@code http://news.cnhubei.com/} and stores each article's title and
 * content in a database table.
 *
 * <p>Written against the WebCollector 2.72 library.
 *
 * @author hu
 */
public class ChujiingNewstpl extends BreadthCrawler {
    // Seed URL the crawl starts from.
    public String seedUrl="http://news.cnhubei.com/";
    // Pattern of the content-page URLs whose articles are collected.
    // NOTE(review): the dots in the host and before "html" are unescaped and the
    // trailing "\\.*" only matches literal dots - confirm this is the intended pattern.
    public String contentRegUrl="http://news.cnhubei.com/.*/p/.*?.html\\.*";
    // Number of crawler threads.
    public int threads_num=10;
    // Upper limit on the number of pages fetched per iteration.
    public int topn_num=10;
    // Crawl depth passed to start().
    public static int levelnum=10;
    // Whether a stopped crawl may continue from where it left off.
    public static boolean resumable=true;
    // The three settings below are only referenced by the commented-out
    // configuration calls in the constructor.
    public int executeTime=20000; //ms
    // Maximum number of attempts for a failed task.
    public static int MaxExecuteCount=2;
    public int connectTimeout=50;
    public int readTimeout=60;
    // Name of the table that receives the extracted articles.
    private String contentTable="news_content";

    // Database handle; assigned in main() before the crawl starts.
    private static JdbcTemplate dbHandler;

    /**
     * Handles one fetched page: when its URL matches {@link #contentRegUrl},
     * extracts the article, sanitizes title and content, and inserts them into
     * {@link #contentTable}. Pages with other URLs are ignored.
     *
     * @param page the fetched page
     * @param next container for follow-up crawl tasks (not used here)
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        if (!page.matchUrl(contentRegUrl)) {
            return;
        }
        try {
            News news = ContentExtractor.getNewsByHtml(page.html());
            // Guard against a missing extraction result instead of relying on a
            // NullPointerException being swallowed by the generic handler below.
            if (news == null || news.getTitle() == null || news.getContent() == null) {
                System.out.println(" no article extracted from " + page.url());
                return;
            }
            // Content keeps the tags allowed by the project whitelist; the title is reduced to plain text.
            String content = Jsoup.clean(news.getContent(), HtmlTools.getWhitelist());
            content = HtmlTools.stripNewLine(content);
            String title = Jsoup.clean(news.getTitle(), Whitelist.none()).trim();
            System.out.println(" get content :" + title);
            if (!title.isEmpty() && !content.isEmpty()) {
                ChujiingNewstpl.dbHandler.update(
                        "insert into " + contentTable + "(title,content) value(?,?)", title, content);
            }
        } catch (DuplicateKeyException e) {
            // The article is already stored; nothing more to do.
            System.out.println(" duplicate item ");
        } catch (Exception e) {
            // Best effort: one failing page must not abort the crawl. Print the URL and the
            // exception itself, because getMessage() alone is null for many exception types.
            System.out.println(" failed to process " + page.url() + " : " + e);
        }
    }

    // Custom request plugin.
    public class MyRequester extends OkHttpRequester {
        String userAgent = "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)";
        // String cookie = "name=abcdef";

        // Called before each request is sent in order to build that request.
        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            // This is OkHttp's Request.Builder; see the OkHttp documentation
            // for how to modify the request headers.
            // System.out.println("request with cookie: " + cookie);
            return super.createRequestBuilder(crawlDatum).header("User-Agent", userAgent);
            // .header("Cookie", cookie);
        }
    }

    /**
     * Creates the crawler, registers the seed URL and URL filters, and applies
     * the thread count and per-iteration page limit.
     *
     * @param crawlPath passed through to the {@code BreadthCrawler} constructor
     * @param autoParse passed through to the {@code BreadthCrawler} constructor
     */
    public ChujiingNewstpl(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // Install the request plugin:
        //setRequester(new MyRequester());
        /*start page*/
        this.addSeed(seedUrl);
        this.addRegex(contentRegUrl);
        // Regex with a leading "-": static resources such as images, stylesheets and scripts.
        this.addRegex("-.*\\.(jpg|png|gif|css|js|font).*");
        setThreads(threads_num);
        Configuration cnf=getConf();
        cnf.setTopN(topn_num);
        // cnf.setExecuteInterval(executeTime);
        // cnf.setConnectTimeout(connectTimeout);
        // cnf.setReadTimeout(readTimeout);
    }

    /**
     * Entry point: obtains the database handle, configures the crawler and runs it.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the crawl fails to start or run
     */
    public static void main(String[] args) throws Exception {
        dbHandler=JDBCHelper.db();
        ChujiingNewstpl crawler = new ChujiingNewstpl("spiderdata"+java.io.File.separator+ChujiingNewstpl.class.getName(), true);
        crawler.setResumable(resumable);
        // Maximum number of attempts for a failed task. This has to be set before
        // start(): start() performs the crawl, so a value set afterwards comes too late.
        crawler.setMaxExecuteCount(MaxExecuteCount);
        crawler.start(levelnum);
    }
}
源码地址 https://down.51cto.com/data/2461609
湖里网站建设公司创新互联,湖里网站设计制作,有大型网站制作公司丰富经验。已为湖里1000多家提供企业网站建设服务。企业网站搭建\外贸网站制作要多少钱,请找那个售后服务好的湖里做网站的公司定做!
文章名称:WebCollector 爬虫采集 Java 代码模板(附带源码地址)
本文路径:http://ybzwz.com/article/iphcee.html