Description
import java.text.SimpleDateFormat;
import java.util.Date;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
 * Page model for a single joke paragraph scraped from jokeji.cn.
 *
 * <p>List pages matching the {@code @HelpUrl} pattern are used to discover links;
 * pages matching the {@code @TargetUrl} pattern are extracted. The class-level
 * {@code @ExtractBy} selects every {@code <p>} under the {@code text110} span, and
 * {@code multi = true} asks for one model instance per matched node.
 */
@TargetUrl("http://www.jokeji.cn/jokehtml/([\\w\\W]*)/\\d+.htm")
@HelpUrl("http://www.jokeji.cn/list_\\d+.htm")
@ExtractBy(value = "//span[@id='text110']/p", multi = true)
public class JokeModel implements AfterExtractor {

    /** Pattern used when rendering {@link #creat_time}; seconds are {@code ss}, not {@code dd}. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Text of the joke.
     * NOTE(review): the report accompanying this code says this XPath yields no data;
     * the expression is kept as-is pending confirmation of the correct selector.
     */
    @ExtractBy(value = "//allText()")
    private String joke;

    /** Time at which this model was post-processed; {@code null} until {@link #afterProcess(Page)} runs. */
    private Date creat_time;

    /**
     * Starts the crawl from the joke list page with five threads and a one second
     * sleep time, printing each extracted model through the console pipeline.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        OOSpider.create(Site.me().setSleepTime(1000),
                        new ConsolePageModelPipeline(), JokeModel.class)
                .addUrl("http://www.jokeji.cn/list.htm")
                .thread(5)
                .run();
    }

    /**
     * Records the current time as the creation time of this model.
     *
     * @param page the page this model was extracted from (not read here)
     */
    @Override
    public void afterProcess(Page page) {
        System.out.println("gegegegeeg");
        creat_time = new Date();
    }

    /**
     * Renders the model as a small JSON-like string with {@code joke} and
     * {@code create_time} entries.
     *
     * @return the rendered string; {@code create_time} is empty when no time has been recorded yet
     */
    @Override
    public String toString() {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        // Guard against a missing timestamp: format(null) would throw NullPointerException.
        String time = (creat_time == null)
                ? ""
                : new SimpleDateFormat(TIME_PATTERN).format(creat_time);
        return "{\"joke\":\"" + joke + "\",\"create_time\":\"" + time + "\"}";
    }
}
上面是完整代码。这里有两个问题:
1. 类上的 `@ExtractBy` 如果不加 `multi = true`,只能取到第一个 `<p>` 下的数据;框架应该在不加该参数时也能自动判断结果是否有多条吧?
2. 字段上的 `@ExtractBy(value = "//allText()")` 取不到数据,换成 `text()`、`allText()`、`tinyText()` 也都取不到。