|
|
|
@ -16,6 +16,7 @@ import java.io.IOException;
|
|
|
|
|
import java.io.InputStreamReader;
|
|
|
|
|
import java.net.HttpURLConnection;
|
|
|
|
|
import java.net.URL;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
|
|
public class TestReadHtml {
|
|
|
|
@ -41,28 +42,29 @@ public class TestReadHtml {
|
|
|
|
|
* @return
|
|
|
|
|
* @throws IOException
|
|
|
|
|
*/
|
|
|
|
|
public static String getXq(String areaName) throws IOException {
|
|
|
|
|
String res = "";
|
|
|
|
|
public static List<String> getXq(String areaName) throws IOException {
|
|
|
|
|
List<String> list = new ArrayList<>();
|
|
|
|
|
String url = "https://baike.baidu.com/item/" + areaName + "?fromModule=lemma_search-box";
|
|
|
|
|
String htmlContent = getHTML(url);
|
|
|
|
|
|
|
|
|
|
// 从字符串解析HTML
|
|
|
|
|
Document doc = Jsoup.parse(htmlContent);
|
|
|
|
|
// 选择所有span标签
|
|
|
|
|
Elements dts = doc.select("dt");
|
|
|
|
|
|
|
|
|
|
// 遍历所有span标签
|
|
|
|
|
// 遍历所有dt标签
|
|
|
|
|
for (Element dt : dts) {
|
|
|
|
|
if (dt.text().equals("下辖地区")) {
|
|
|
|
|
System.out.println(dt.nextElementSibling().text());
|
|
|
|
|
// 使用正则表达式替换掉以[]包含的部分
|
|
|
|
|
String output = dt.nextElementSibling().text().replaceAll("\\[.*?\\]", "");
|
|
|
|
|
output = output.trim();
|
|
|
|
|
list.add(output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (dt.text().replace(" ", "").equals("面积")) {
|
|
|
|
|
System.out.println(dt.nextElementSibling().text());
|
|
|
|
|
String output = dt.nextElementSibling().text().replaceAll("\\[.*?\\]", "");
|
|
|
|
|
output = output.replace("km²", "");
|
|
|
|
|
output = output.trim();
|
|
|
|
|
list.add(output);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res;
|
|
|
|
|
return list;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) throws NoApiKeyException, InputRequiredException, IOException, InterruptedException {
|
|
|
|
@ -73,7 +75,7 @@ public class TestReadHtml {
|
|
|
|
|
for (Record record : list) {
|
|
|
|
|
String areaName = record.getStr("full_name");
|
|
|
|
|
String cityName = record.getStr("city_name");
|
|
|
|
|
String res = getXq(areaName);
|
|
|
|
|
List<String> res = getXq(areaName);
|
|
|
|
|
System.out.println(cityName + "\t" + areaName + "\t" + res);
|
|
|
|
|
Thread.sleep(1000);
|
|
|
|
|
}
|
|
|
|
|