# 安装 jar 包
HttpClient 依赖,推荐安装 4.5.14 版本
网址:https://hc.apache.org/downloads.cgi
# 配置 jar 包
1、idea 新建一个空白的 Java 项目
2、在该项目目录下新建一个名为“lib”的空目录
3、解压下载的压缩包
4、给项目配置 jar 包依赖
5、将解压后的压缩包中 lib 目录下的所有 jar 文件复制到项目中新建的 lib 目录下(即 demo/lib/)
# 使用
# GET 请求
import org.apache.http.HttpEntity; | |
import org.apache.http.client.ClientProtocolException; | |
import org.apache.http.client.methods.CloseableHttpResponse; | |
import org.apache.http.client.methods.HttpGet; | |
import org.apache.http.impl.client.CloseableHttpClient; | |
import org.apache.http.impl.client.HttpClients; | |
import org.apache.http.util.EntityUtils; | |
import java.io.IOException; | |
public class DemoRequest { | |
public static void main(String[] args) throws IOException { | |
// 1. 创建一个 httpClient 实例 | |
CloseableHttpClient httpClient = HttpClients.createDefault(); | |
// 2. 创建 GET 请求方法实例 | |
HttpGet httpGet = new HttpGet("https://www.baidu.com"); | |
// 3. 调用 HttpClient 实例执行 GET 请求方法,获取 response | |
CloseableHttpResponse response = httpClient.execute(httpGet); | |
// 4. 读 response 响应 | |
int status = response.getStatusLine().getStatusCode(); | |
System.out.println("status: " + status); | |
// 5. 解析响应内容 | |
if (status == 200) { | |
HttpEntity entity = response.getEntity(); | |
System.out.println("=================="); | |
String html = EntityUtils.toString(entity, "UTF-8"); | |
System.out.println(html); | |
} else { | |
throw new ClientProtocolException("==>请求失败<=="); | |
} | |
} | |
} |
# 重写响应
在 HttpClient 中, ResponseHandler 是一个用于处理 HTTP 响应的接口:
我们可以自行实现它(重写其中的 handleResponse 方法),来定制对响应的处理方式
public static void getTest2() throws IOException { | |
CloseableHttpClient httpClient = HttpClients.createDefault(); | |
HttpGet httpGet = new HttpGet("https://www.baidu.com"); | |
// 重写响应 | |
ResponseHandler<String> responseHandle = new ResponseHandler<String>(){ | |
@Override | |
public String handleResponse(HttpResponse httpResponse) throws ClientProtocolException, IOException { | |
int status = httpResponse.getStatusLine().getStatusCode(); | |
if (status == 200) { | |
HttpEntity entity = httpResponse.getEntity(); | |
return EntityUtils.toString(entity, "UTF-8"); | |
} else { | |
throw new IOException("==>请求失败<=="); | |
} | |
} | |
}; | |
try { | |
String responseBody = httpClient.execute(httpGet, responseHandle); | |
System.out.println(responseBody); | |
} catch (IOException e){ | |
e.printStackTrace(); | |
} | |
} |
# 请求参数携带
public static void getTestParams() throws IOException { | |
CloseableHttpClient httpClient = HttpClients.createDefault(); | |
String baseUrl = "https://wzzdg.sun0769.com/political/index/politicsNewest"; | |
try { | |
URIBuilder uriBuilder = new URIBuilder(baseUrl); | |
uriBuilder.setParameter("page", "1") | |
.setParameter("id", "1"); | |
URI url = uriBuilder.build(); | |
HttpGet httpGet = new HttpGet(url); | |
// 重写响应 | |
ResponseHandler<String> responseHandle = new ResponseHandler<String>(){ | |
@Override | |
public String handleResponse(HttpResponse httpResponse) throws ClientProtocolException, IOException { | |
int status = httpResponse.getStatusLine().getStatusCode(); | |
if (status == 200) { | |
HttpEntity entity = httpResponse.getEntity(); | |
return EntityUtils.toString(entity, "UTF-8"); | |
} else { | |
throw new IOException("==>请求失败<=="); | |
} | |
} | |
}; | |
try { | |
String responseBody = httpClient.execute(httpGet, responseHandle); | |
System.out.println(responseBody); | |
} catch (IOException e){ | |
e.printStackTrace(); | |
} | |
} catch (URISyntaxException e) { | |
e.printStackTrace(); | |
} | |
} |
# POST 请求
# 请求参数为字符串
public static void postTest() throws IOException { | |
HttpPost httpPost = new HttpPost("http://httpbin.org/post"); | |
httpPost.setEntity(new StringEntity("this is Post")); // 传递请求参数,这里的请求参数是一个字符串 | |
CloseableHttpResponse response = httpClient.execute(httpPost); | |
int status = response.getStatusLine().getStatusCode(); | |
if (status == 200) { | |
HttpEntity entity = response.getEntity(); | |
String html = EntityUtils.toString(entity, "UTF-8"); | |
System.out.println(html); | |
} else { | |
throw new ClientProtocolException("==>请求失败<=="); | |
} | |
} |
# 请求参数为表单
public static void postTestForm() throws IOException { | |
List<NameValuePair> params = new ArrayList<>(); | |
params.add(new BasicNameValuePair("column", "szse_gem_latest")); | |
params.add(new BasicNameValuePair("pageNum", "1")); | |
params.add(new BasicNameValuePair("pageSize", "30")); | |
params.add(new BasicNameValuePair("sortName", "")); | |
params.add(new BasicNameValuePair("sortType", "")); | |
params.add(new BasicNameValuePair("clusterFlag", "true")); | |
HttpPost httpPost = new HttpPost("http://www.cninfo.com.cn/new/disclosure"); | |
httpPost.setHeader("Content-Type", "application/x-www-form-urlencoded;"); | |
httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"); | |
httpPost.setEntity(new UrlEncodedFormEntity(params, Consts.UTF_8)); | |
CloseableHttpResponse response = httpClient.execute(httpPost); | |
int status = response.getStatusLine().getStatusCode(); | |
if (status == 200) { | |
HttpEntity entity = response.getEntity(); | |
String res = EntityUtils.toString(entity); | |
System.out.println(res); | |
} else { | |
throw new ClientProtocolException("==>请求失败<=="); | |
} | |
response.close(); | |
httpClient.close(); | |
} |
# 请求参数为 json
public static void postTestJson() throws IOException { | |
HttpPost httpPost = new HttpPost("http://httpbin.org/post"); | |
String json = "{\"name\":\"sun0769\",\"age\":18}"; | |
StringEntity stringEntity = new StringEntity(json, Consts.UTF_8); | |
httpPost.setEntity(stringEntity); | |
CloseableHttpResponse response = httpClient.execute(httpPost); | |
int status = response.getStatusLine().getStatusCode(); | |
if (status == 200) { | |
HttpEntity entity = response.getEntity(); | |
String html = EntityUtils.toString(entity, "UTF-8"); | |
System.out.println(html); | |
} else { | |
throw new ClientProtocolException(); | |
} | |
} |
# 数据解析
# jsoup 依赖下载
下载网址:https://jsoup.org/download
下载好后,按照前文「配置 jar 包」一节的步骤配置好 jar 包
# 使用
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import java.io.File; | |
import java.io.IOException; | |
public static void domTest(){ | |
try { | |
File file = new File("D:\\Pyt_Pyc_Java_Hadoop_Ws_Django\\IDEA\\Java爬虫\\demo\\src\\index.html"); | |
Document doc = Jsoup.parse(file, "UTF-8"); | |
Elements lis = doc.select("ul.title-state-ul li"); | |
for (Element li : lis) { | |
String text = li.text(); | |
System.out.println(text); | |
} | |
Element li_f = doc.select("ul.title-state-ul li").first(); | |
System.out.println(li_f.text()); | |
System.out.println("===属性提取==="); | |
Elements select = doc.select("ul.title-state-ul>li"); | |
Element element = select.get(0); | |
Element first = element.select("span.state3").first(); | |
Element a = first.select("a").first(); | |
System.out.println(a.text()); | |
System.out.println(a.attr("href")); // 获取单个属性 | |
System.out.println(a.attributes()); // 获取全部属性 | |
// 用属性作为条件筛选 | |
Elements select1 = doc.select("a[href^=/political]"); | |
System.out.println(select1); | |
System.out.println("===模糊匹配==="); | |
Elements select2 = doc.select("span[class~=state]"); | |
System.out.println(select2); | |
System.out.println("========遍历(Traversal)API=========="); | |
Element parent = first.parent(); // 获取父节点 | |
Elements children = first.children(); // 获取子节点 | |
Element brother_n = first.nextElementSibling(); // 获取后兄弟节点 | |
Element brother_p = first.previousElementSibling(); // 获取前兄弟节点 | |
Element brother_f = first.firstElementSibling(); // 获取第一个兄弟节点 | |
Elements brothers = first.siblingElements(); // 获取所有兄弟节点 | |
System.out.println(brothers); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} |
# 连接 mysql 数据库并插入数据
# mysql connect 依赖下载
下载地址:https://mvnrepository.com/artifact/mysql/mysql-connector-java
根据自己的 MySQL 服务版本下载对应版本的驱动,并按照前文「配置 jar 包」一节的步骤配置 jar 包
# 连接 MySQL 数据库并插入数据
import java.sql.Connection; | |
import java.sql.DriverManager; | |
import java.sql.PreparedStatement; | |
import java.sql.SQLException; | |
public static void dbSave(){ | |
String url = "jdbc:mysql://192.168.10.129:3306/spider"; | |
String user = "root"; | |
String pwd = "123456"; | |
try { | |
Connection conn = DriverManager.getConnection(url, user, pwd); | |
System.out.println("数据库连接成功!"); | |
String sql = "insert into data_java(id,title,content,dtime) values(?,?,?,?)"; | |
PreparedStatement statement = conn.prepareStatement(sql); | |
statement.setString(1, "1"); | |
statement.setString(2, "噪音极大"); | |
statement.setString(3, "等待处理:17天8小时14分"); | |
statement.setString(4, "2024-02-29 15:21:24"); | |
statement.executeUpdate(); | |
System.out.println("插入成功!"); | |
statement.close(); | |
} catch (SQLException e) { | |
throw new RuntimeException(e); | |
} | |
} |
# 多线程爬虫
# 线程池
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
/**
 * Thread-pool example: runs 100 small jobs on a fixed pool of 5 threads and
 * reports how long the whole batch took.
 */
public class MultiThreadExample {

    /**
     * Submits the jobs, waits for all of them to finish and prints the
     * elapsed time.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Fixed-size pool with 5 worker threads.
        ExecutorService executorService = Executors.newFixedThreadPool(5);
        long start = System.currentTimeMillis();

        // Hand 100 jobs to the pool.
        for (int i = 0; i < 100; i++) {
            final int page = i;
            executorService.submit(() -> job(page));
        }

        // submit() only queues the work. The original measured the time right
        // here, i.e. before the jobs had run, and reported them as finished.
        // Stop accepting new work and wait for the queued jobs to complete
        // before taking the end timestamp.
        executorService.shutdown();
        try {
            // Fully-qualified name so that no additional import is required.
            if (!executorService.awaitTermination(1, java.util.concurrent.TimeUnit.MINUTES)) {
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            executorService.shutdownNow();
            // Restore the interrupt flag for whoever runs this thread.
            Thread.currentThread().interrupt();
        }

        long end = System.currentTimeMillis();
        System.out.println("任务执行完毕,耗时:" + (end - start) + "ms");
    }

    /**
     * The unit of work executed by the pool: prints the page number.
     *
     * @param page page number handled by this job
     */
    public static void job(int page) {
        System.out.println("page:" + page);
    }
}
# 完整 demo
目标网站:https://wzzdg.sun0769.com/political/index/politicsNewest?id=1&page=1
import org.apache.http.HttpEntity; | |
import org.apache.http.client.ClientProtocolException; | |
import org.apache.http.client.methods.CloseableHttpResponse; | |
import org.apache.http.client.methods.HttpGet; | |
import org.apache.http.impl.client.CloseableHttpClient; | |
import org.apache.http.impl.client.HttpClients; | |
import org.apache.http.util.EntityUtils; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import java.io.IOException; | |
import java.sql.Connection; | |
import java.sql.DriverManager; | |
import java.sql.PreparedStatement; | |
import java.sql.SQLException; | |
import java.util.concurrent.CompletableFuture; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
public class Demo { | |
private static String DB_USERNAME = "root"; | |
private static String DB_PASSWORD = "123456"; | |
private static String DB_URL = "jdbc:mysql://192.168.10.129:3306/spider"; | |
private static String BASE_URL = "https://wzzdg.sun0769.com/political/index/politicsNewest"; | |
public static void main(String[] args) { | |
ExecutorService executorService = Executors.newFixedThreadPool(5); | |
for (int i=1; i<=20; i++) { | |
final int page = i; | |
CompletableFuture.runAsync(() -> { | |
try { | |
String url = BASE_URL + "?id=1&page=" + page; | |
// 获取数据 | |
String content = crawl(url); | |
// 处理数据 | |
process(content); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
}, executorService); | |
} | |
executorService.shutdown(); | |
} | |
public static String crawl(String url) { | |
CloseableHttpClient httpClient = HttpClients.createDefault(); | |
HttpGet httpGet = new HttpGet(url); | |
try { | |
CloseableHttpResponse response = httpClient.execute(httpGet); | |
int status = response.getStatusLine().getStatusCode(); | |
if (status >= 200 && status < 300) { | |
HttpEntity entity = response.getEntity(); | |
return EntityUtils.toString(entity, "UTF-8"); | |
} else { | |
throw new ClientProtocolException("==>请求失败<=="); | |
} | |
} catch (IOException e) { | |
throw new RuntimeException(e); | |
} | |
} | |
public static void process(String content) { | |
try(Connection conn = DriverManager.getConnection(DB_URL, DB_USERNAME, DB_PASSWORD)){ | |
Document doc = Jsoup.parse(content, "UTF-8"); | |
Elements lis = doc.select("ul.title-state-ul li"); | |
String sql = "insert into data_java(id,status,title,massage,dtime) values(?,?,?,?,?)"; | |
PreparedStatement statement = conn.prepareStatement(sql); | |
for (Element li : lis) { | |
String id = li.select("span.state1").text(); | |
String status = li.select("span.state2").text(); | |
String title = li.select("span.state3>a").text(); | |
String massage = li.select("span.state4").text(); | |
String dtime = li.select("span.state5").text(); | |
statement.setString(1, id); | |
statement.setString(2, status); | |
statement.setString(3, title); | |
statement.setString(4, massage); | |
statement.setString(5, dtime); | |
statement.executeUpdate(); | |
System.out.println("id:"+id+", title:"+title+" --> 保存成功"); | |
} | |
statement.close(); | |
} catch (SQLException e) { | |
e.printStackTrace(); | |
} | |
} | |
} |