【JAVA】java解析HTML代码_java怎么解析html代码_poolsnowhui的博客

【开发环境】

1.Eclipse ，JDK1.7，Windows。 2.第三方jar包， jsoup-1.8.2.jar 。（MVN仓库，可以下载jar包） 3.源代码HtmlParser.java。 import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; * Jsoup解析html标签时类似于JQuery的一些符号 * @author chixh public class HtmlParser { protected List<List<String>> data = new LinkedList<List<String>>(); * 获取value值 * @param e * @return public static String getValue(Element e) { return e.attr("value"); * </tr> * 之间的文本 * @param e * @return public static String getText(Element e) { return e.text(); * 识别属性id的标签,一般一个html页面id唯一 * @param body * @param id * @return public static Element getID(String body, String id) { Document doc = Jsoup.parse(body); // 所有#id的标签 Elements elements = doc.select("#" + id); // 返回第一个 return elements.first(); * 识别属性class的标签 * @param body * @param class * @return public static Elements getClassTag(String body, String classTag) { Document doc = Jsoup.parse(body); // 所有#id的标签 return doc.select("." 
+ classTag); * 获取tr标签元素组 * @param e * @return public static Elements getTR(Element e) { return e.getElementsByTag("tr"); * 获取td标签元素组 * @param e * @return public static Elements getTD(Element e) { return e.getElementsByTag("td"); * 获取表元组 * @param table * @return public static List<List<String>> getTables(Element table){ List<List<String>> data = new ArrayList<>(); for (Element etr : table.select("tr")) { List<String> list = new ArrayList<>(); for (Element etd : etr.select("td")) { String temp = etd.text(); //增加一行中的一列 list.add(temp); //增加一行 data.add(list); return data; * 读html文件 * @param fileName * @return public static String readHtml(String fileName){ FileInputStream fis = null; StringBuffer sb = new StringBuffer(); try { fis = new FileInputStream(fileName); byte[] bytes = new byte[1024]; while (-1 != fis.read(bytes)) { sb.append(new String(bytes)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { fis.close(); } catch (IOException e1) { e1.printStackTrace(); return sb.toString(); public static void main(String[] args) { // String url = "http://www.baidu.com"; // String body = HtmlBody.getBody(url); // System.out.println(body); Document doc = Jsoup.parse(readHtml("./index.html")); // 获取html的标题 String title = doc.select("title").text(); System.out.println(title); // 获取按钮的文本 String btnText = doc.select("div div div div div form").select("#su").attr("value"); System.out.println(btnText); // 获取导航栏文本 Elements elements = doc.select(".head_wrapper").select("#u1").select("a"); for (Element e : elements) { System.out.println(e.text()); Document doc2 = Jsoup.parse(readHtml("./table.html")); Element table = doc2.select("table").first(); List<List<String>> list = getTables(table); for (List<String> list2 : list) { for (String string : list2) { System.out.print(string+","); System.out.println();

【解析的HTML页面文件】

//index.html

<!DOCTYPE html>
<!--STATUS OK-->
<meta http-equiv=content-type content=text/html;charset=utf-8>
<meta http-equiv=X-UA-Compatible content=IE=Edge>
<meta content=always name=referrer>
<link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css>
<title>百度一下，你就知道</title>
</head>
<body link=#0000cc>
	<div id=wrapper>
		<div id=head>
			<div class=head_wrapper>
				<div class=s_form>
					<div class=s_form_wrapper>
						<div id=lg>
							<img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129>
						<form id=form name=f action=//www.baidu.com/s class=fm>
							<input type=hidden name=bdorz_come value=1>
							<input type=hidden name=ie value=utf-8>
							<input type=hidden name=f value=8>
							<input type=hidden name=rsv_bp value=1>
							<input type=hidden name=rsv_idx value=1>
							<input type=hidden name=tn value=baidu>
							<span class="bg s_ipt_wr">
								<input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus>
							</span>
							<span class="bg s_btn_wr">
								<input type=submit id=su value=百度一下 class="bg s_btn">
							</span>
						</form>
				<div id=u1>
					<a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a>
					<a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a>
					<a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a>
					<a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a>
					<a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a>
					<noscript>
						<a href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a>
					</noscript>
					<script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script>
					<a href=//www.baidu.com/more / name=tj_briicon class=bri style="display: block;">更多产品</a>
		<div id=ftCon>
			<div id=ftConw>
				<p id=lh>
					<a href=http://home.baidu.com>关于百度</a>
					<a href=http://ir.baidu.com>About Baidu</a>
				<p id=cp>
					&copy;2017 Baidu 
					<a href=http://www.baidu.com/duty />
					使用百度前必读
					<a href=http://jianyi.baidu.com / class=cp-feedback>意见反馈</a>
					 京ICP证030173号  <img src=//www.baidu.com/img/gs.gif>
</body>
</html>
//table.html

	<table border="0" width="750" bgcolor="#000000" cellspacing="1" cellpadding="2">
		<tr bgcolor="#efefef">
			<td width="80">基金代码</td>
			<td width="100">基金名称</td>
			<td>单位基金净值(元)</td>
			<td>单位累计净值(元)</td>
			<td>年中年末份额净值(元)</td>
			<td>年中年末累计净值(元)</td>
			<td>基金资产净值(元)</td>
			<td></td>
			<td></td>
		<tr bgcolor="#FFFFFF" height="30">
			<td>010101</td>
			<td>天弘股票基金</td>
			<td style="color: blue">2.100</td>
			<td style="color: blue">4.001</td>
			<td style="color: blue"></td>
			<td style="color: blue"></td>
			<td style="color: blue"></td>
			<td style="color: blue"></td>
			<td style="color: blue"></td>
	</table>
</body>
</html>

【程序结果】

HTML在web端、手机端的运用越来越频繁。采用python等脚本语言，能够读取HTML代码。这里介绍采用java程序来读取HTML代码，由于java的通用性，可以很好地解析HTML中的数据，并存放到数据库中。读取网页中的HTML代码，见博客【JAVA】JAVA程序根据url请求网站HTML页面【开发环境】1.Eclipse ，JDK1.7，Windows。2.第三方jar包，js…

Jsoup 解析 xml Jsoup：jsoup 是一款 Java 的 HTML 解析器，可直接解析某个URL地址、 HTML 文本内容。它提供了一套非常省力的API，可通过DOM，CSS以及类似于jQuery的操作方法来取出和操作数据。 1. 导入jar包 2. 获取Document对象 3. 获取对应的标签Element对象 4. 获取数据 user.xml <?x...

使用Jsoup 解析 html 代码的示例。以科技资讯网站cnbeta为例，demo中点击按钮会获取cnbeta的首页 html ，然后使用Jsoup 解析，在屏幕上显示当前页的所有文章标题及链接。使用到的Jsoup的jar包也打包在内

import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java .io.IOException; public class HTML Parser { public static void main(String[] args) throws IOException { // 使用 Jsoup 解析 HTML Document doc = Jsoup.connect("https://www.example.com").get(); // 选择所有的标题元素 Elements titles = doc.select("h1"); for (Element title : titles) { System.out.println("标题：" + title.text()); 在这段代码中，我们使用了 Jsoup 库解析 HTML ，然后选择了所有的标题元素（标签为 `h1`）并输出其文本内容。