diff --git a/src/main/java/com/dsideal/base/Test/TestReadHtml.java b/src/main/java/com/dsideal/base/Test/TestReadHtml.java index 694f9137..ad879492 100644 --- a/src/main/java/com/dsideal/base/Test/TestReadHtml.java +++ b/src/main/java/com/dsideal/base/Test/TestReadHtml.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; +import java.util.ArrayList; import java.util.List; public class TestReadHtml { @@ -41,28 +42,29 @@ public class TestReadHtml { * @return * @throws IOException */ - public static String getXq(String areaName) throws IOException { - String res = ""; + public static List getXq(String areaName) throws IOException { + List list = new ArrayList<>(); String url = "https://baike.baidu.com/item/" + areaName + "?fromModule=lemma_search-box"; String htmlContent = getHTML(url); - // 从字符串解析HTML Document doc = Jsoup.parse(htmlContent); - // 选择所有span标签 Elements dts = doc.select("dt"); - - // 遍历所有span标签 + // 遍历所有dt标签 for (Element dt : dts) { if (dt.text().equals("下辖地区")) { - System.out.println(dt.nextElementSibling().text()); + // 使用正则表达式替换掉以[]包含的部分 + String output = dt.nextElementSibling().text().replaceAll("\\[.*?\\]", ""); + output = output.trim(); + list.add(output); } - if (dt.text().replace(" ", "").equals("面积")) { - System.out.println(dt.nextElementSibling().text()); + String output = dt.nextElementSibling().text().replaceAll("\\[.*?\\]", ""); + output = output.replace("km²", ""); + output = output.trim(); + list.add(output); } } - - return res; + return list; } public static void main(String[] args) throws NoApiKeyException, InputRequiredException, IOException, InterruptedException { @@ -73,7 +75,7 @@ public class TestReadHtml { for (Record record : list) { String areaName = record.getStr("full_name"); String cityName = record.getStr("city_name"); - String res = getXq(areaName); + List res = getXq(areaName); System.out.println(cityName + "\t" + areaName + "\t" + res); Thread.sleep(1000); }