日期:2014-05-16  浏览次数:20290 次

今天在 Nutch 1.2 中用 jsoup 解析了一下页面,用起来挺爽的。

/**
 * This example scrapes a Tianya Wenda thread (the question and its replies) into a map.
 */
package org.apache.nutch.our;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
?* 页面解析
?*
?* @author LJ
?*
?*/
public class JsoupParse {

??? public static Map<String,List<String>> parser(String content) {
??? ??? Map<String,List<String>> map = new HashMap<String,List<String>>();
??? ??? Document doc = Jsoup.parse(content);
??? ??? // Element body = doc.body();

??? ??? Element titles = doc.select("div.wpcpsCSS").first();
??? ??? String name = titles.ownText();

??? ??? System.out.println("标题" + name);
??? ??? // 获取标题内容
??? ??? Element dep = doc.select("div[style~=(margin:2px 5px 3px 4px)]")
??? ??? ??? ??? .first();
??? ??? String description = dep.text();
??? ??? System.out.println("标题内容" + description);
??? ???
??? ??? String mapString = name+","+description;
??? ???

??? ??? // 回复
??? ??? //利用集合来装多个回复
??? ??? List<String> rs = new ArrayList<String>();
??? ??? Elements replys = doc.select("div[style~=(margin: 2px 5px 3px 4px;)]");
??? ??? for (int i = 0; i < replys.size(); i++) {
??? ??? ??? Element e = replys.get(i);
??? ??? ??? String ry = e.text();
??? ??? ??? rs.add(ry);
??? ??? ??? System.out.println("回复内容: " + ry);
??? ??? }
??? ??? //把问与回复存入map
??? ??? map.put(mapString, rs);
??? ??? return map;

??? ??? // 获取分类
??? ??? // String pattern
??? ??? // ="<a class='wpfitCSS wpfilCSS' id=hover title='[*]' href='label?lid=[(0-9a-zA-Z)*]'>";
??? ??? // Element category = doc.getElementsMatchingOwnText(pattern).first();
??? ??? // System.out.println(category.text());
??? ??? // System.out.println(body);// 获取页面body内容
??? }

??? public static void main(String[] args) throws Exception {
??? ??? //String url = "http://wenda.tianya.cn/wenda/thread?tid=15krbkptkho99qirlp0a5rf2dlrqk443dkhj7";
??? ??? String url2 = "http://wenda.tianya.cn/wenda/thread?tid=15kra997o6kb9p17pvr5fpaj5j8n2ub65sgem";
??? ??? String content = FileDownLoader.doGet(url2, "UTF-8");
??? ??? parser(content);
??? ??? // String content2 = FileDownLoader.doGet(url, "UTF-8");
??? ??? // parser(content2);
??? }

}

注:查阅 jsoup 的 API 文档,可以对网页进行更深入的解析。