main
黄海 8 months ago
parent 8dc0b6aecd
commit dcb66ad617

@ -16,6 +16,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
public class TestReadHtml {
@ -41,28 +42,29 @@ public class TestReadHtml {
* @return
* @throws IOException
*/
public static String getXq(String areaName) throws IOException {
String res = "";
public static List<String> getXq(String areaName) throws IOException {
List<String> list = new ArrayList<>();
String url = "https://baike.baidu.com/item/" + areaName + "?fromModule=lemma_search-box";
String htmlContent = getHTML(url);
// 从字符串解析HTML
Document doc = Jsoup.parse(htmlContent);
// 选择所有span标签
Elements dts = doc.select("dt");
// 遍历所有span标签
// 遍历所有dt标签
for (Element dt : dts) {
if (dt.text().equals("下辖地区")) {
System.out.println(dt.nextElementSibling().text());
// 使用正则表达式替换掉以[]包含的部分
String output = dt.nextElementSibling().text().replaceAll("\\[.*?\\]", "");
output = output.trim();
list.add(output);
}
if (dt.text().replace(" ", "").equals("面积")) {
System.out.println(dt.nextElementSibling().text());
String output = dt.nextElementSibling().text().replaceAll("\\[.*?\\]", "");
output = output.replace("km²", "");
output = output.trim();
list.add(output);
}
}
return res;
return list;
}
public static void main(String[] args) throws NoApiKeyException, InputRequiredException, IOException, InterruptedException {
@ -73,7 +75,7 @@ public class TestReadHtml {
for (Record record : list) {
String areaName = record.getStr("full_name");
String cityName = record.getStr("city_name");
String res = getXq(areaName);
List<String> res = getXq(areaName);
System.out.println(cityName + "\t" + areaName + "\t" + res);
Thread.sleep(1000);
}

Loading…
Cancel
Save