# 安装 jar 包

HttpClient 依赖,推荐安装 4.5.14 版本

网址:https://hc.apache.org/downloads.cgi

image-20240228111046888

# 配置 jar 包

1、idea 新建一个空白的 Java 项目

2、在该项目目录下新建一个空目录名称为‘lib’

3、解压下载的 jar 包

4、给项目配置 jar 包依赖

image-20240228113147619

image-20240228113928011

image-20240228114102945

image-20240228114206680

5、将下载并解压后的压缩包中 lib 目录下的所有 jar 文件,复制到项目中新建的 lib 目录下(即 demo/lib/)

# 使用

# GET 请求
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HttpClient GET example: requests a page and prints its status code and body.
 */
public class DemoRequest {
    /**
     * Sends a GET request to https://www.baidu.com, prints the status code and,
     * when the status is 200, the response body decoded as UTF-8.
     *
     * @param args unused
     * @throws ClientProtocolException if the status code is anything other than 200
     * @throws IOException if executing the request or reading the body fails
     */
    public static void main(String[] args) throws IOException {
        // Build the GET request.
        HttpGet httpGet = new HttpGet("https://www.baidu.com");
        // try-with-resources closes the response and the client on every path,
        // including the failure branch below, so no connection is leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Read the status line.
            int status = response.getStatusLine().getStatusCode();
            System.out.println("status: " + status);
            if (status != 200) {
                throw new ClientProtocolException("==>请求失败<==");
            }
            // Decode and print the body.
            HttpEntity entity = response.getEntity();
            System.out.println("==================");
            String html = EntityUtils.toString(entity, "UTF-8");
            System.out.println(html);
        }
    }
}
# 重写响应

在 Java 中,ResponseHandler 是 HttpClient 提供的一个接口,用于处理 HTTP 请求的响应(我们可以通过实现该接口来自定义响应的处理逻辑)

/**
 * Sends a GET request to https://www.baidu.com and processes the response
 * through a custom {@code ResponseHandler}, then prints the body.
 *
 * <p>An {@code IOException} raised while executing the request (including the
 * one the handler throws for a non-200 status) is caught and its stack trace
 * printed; it is not propagated.
 *
 * @throws IOException declared for callers; the request failure path is handled inside
 */
public static void getTest2() throws IOException {
    HttpGet httpGet = new HttpGet("https://www.baidu.com");
    // Custom response handling: return the UTF-8 body on 200, fail otherwise.
    ResponseHandler<String> responseHandle = new ResponseHandler<String>() {
        @Override
        public String handleResponse(HttpResponse httpResponse) throws ClientProtocolException, IOException {
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status != 200) {
                throw new IOException("==>请求失败<==");
            }
            HttpEntity entity = httpResponse.getEntity();
            return EntityUtils.toString(entity, "UTF-8");
        }
    };
    // The client is closed automatically, whether or not the request succeeds.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        String responseBody = httpClient.execute(httpGet, responseHandle);
        System.out.println(responseBody);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
# 请求参数携带
/**
 * Sends a GET request whose query string is assembled with {@code URIBuilder}
 * ({@code page=1}, {@code id=1}) and prints the response body.
 *
 * <p>Both a malformed URI and a failed request are caught and reported via
 * {@code printStackTrace()}; neither is propagated.
 *
 * @throws IOException declared for callers; the request failure path is handled inside
 */
public static void getTestParams() throws IOException {
    String baseUrl = "https://wzzdg.sun0769.com/political/index/politicsNewest";
    // Custom response handling: return the UTF-8 body on 200, fail otherwise.
    ResponseHandler<String> responseHandle = new ResponseHandler<String>() {
        @Override
        public String handleResponse(HttpResponse httpResponse) throws ClientProtocolException, IOException {
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status != 200) {
                throw new IOException("==>请求失败<==");
            }
            HttpEntity entity = httpResponse.getEntity();
            return EntityUtils.toString(entity, "UTF-8");
        }
    };
    // The client is closed automatically on every path.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        // URIBuilder appends and encodes the query parameters.
        URI url = new URIBuilder(baseUrl)
                .setParameter("page", "1")
                .setParameter("id", "1")
                .build();
        HttpGet httpGet = new HttpGet(url);
        String responseBody = httpClient.execute(httpGet, responseHandle);
        System.out.println(responseBody);
    } catch (URISyntaxException | IOException e) {
        e.printStackTrace();
    }
}
# POST 请求
# 请求参数为字符串
/**
 * Sends a POST request to http://httpbin.org/post with a plain string body
 * and prints the response body when the status is 200.
 *
 * <p>NOTE(review): {@code httpClient} is not declared in this method; it must be
 * a client visible in the enclosing class. This method does not close it.
 *
 * @throws ClientProtocolException if the status code is anything other than 200
 * @throws IOException if executing the request or reading the body fails
 */
public static void postTest() throws IOException {
    HttpPost httpPost = new HttpPost("http://httpbin.org/post");
    // The request payload here is a raw string.
    httpPost.setEntity(new StringEntity("this is Post"));
    // Closing the response releases the underlying connection, also on the failure branch.
    try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
        int status = response.getStatusLine().getStatusCode();
        if (status != 200) {
            throw new ClientProtocolException("==>请求失败<==");
        }
        HttpEntity entity = response.getEntity();
        String html = EntityUtils.toString(entity, "UTF-8");
        System.out.println(html);
    }
}
# 请求参数为表单
/**
 * Sends a form-encoded POST request to http://www.cninfo.com.cn/new/disclosure
 * and prints the response body when the status is 200.
 *
 * <p>The response and {@code httpClient} are closed on every path, including when
 * the request fails or a non-200 status is returned.
 *
 * <p>NOTE(review): {@code httpClient} is not declared in this method; it must be
 * a client visible in the enclosing class. It is closed here, as in the original
 * flow, so it cannot be reused afterwards.
 *
 * @throws ClientProtocolException if the status code is anything other than 200
 * @throws IOException if executing the request or reading the body fails
 */
public static void postTestForm() throws IOException {
    // Form fields sent as application/x-www-form-urlencoded.
    List<NameValuePair> params = new ArrayList<>();
    params.add(new BasicNameValuePair("column", "szse_gem_latest"));
    params.add(new BasicNameValuePair("pageNum", "1"));
    params.add(new BasicNameValuePair("pageSize", "30"));
    params.add(new BasicNameValuePair("sortName", ""));
    params.add(new BasicNameValuePair("sortType", ""));
    params.add(new BasicNameValuePair("clusterFlag", "true"));

    HttpPost httpPost = new HttpPost("http://www.cninfo.com.cn/new/disclosure");
    httpPost.setHeader("Content-Type", "application/x-www-form-urlencoded;");
    httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
    httpPost.setEntity(new UrlEncodedFormEntity(params, Consts.UTF_8));

    try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
        int status = response.getStatusLine().getStatusCode();
        if (status != 200) {
            throw new ClientProtocolException("==>请求失败<==");
        }
        HttpEntity entity = response.getEntity();
        // UTF-8 is only the fallback; a charset declared by the response takes precedence.
        String res = EntityUtils.toString(entity, "UTF-8");
        System.out.println(res);
    } finally {
        // Previously skipped whenever the non-200 branch threw.
        httpClient.close();
    }
}
# 请求参数为 json
/**
 * Sends a POST request to http://httpbin.org/post with a JSON body and prints
 * the response body when the status is 200.
 *
 * <p>NOTE(review): {@code httpClient} is not declared in this method; it must be
 * a client visible in the enclosing class. This method does not close it.
 *
 * @throws ClientProtocolException if the status code is anything other than 200
 * @throws IOException if executing the request or reading the body fails
 */
public static void postTestJson() throws IOException {
    HttpPost httpPost = new HttpPost("http://httpbin.org/post");
    String json = "{\"name\":\"sun0769\",\"age\":18}";
    StringEntity stringEntity = new StringEntity(json, Consts.UTF_8);
    // Without an explicit content type the entity is sent as text/plain;
    // declare it as JSON so the server parses it as such.
    stringEntity.setContentType("application/json; charset=UTF-8");
    httpPost.setEntity(stringEntity);
    // Closing the response releases the underlying connection, also on the failure branch.
    try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
        int status = response.getStatusLine().getStatusCode();
        if (status != 200) {
            // Same failure message as the other request examples.
            throw new ClientProtocolException("==>请求失败<==");
        }
        HttpEntity entity = response.getEntity();
        String html = EntityUtils.toString(entity, "UTF-8");
        System.out.println(html);
    }
}

# 数据解析

# jsoup 依赖下载

下载网址:https://jsoup.org/download

下载好后,同第二步一样配置好 jar 包

# 使用
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
/**
 * Demonstrates jsoup on a local HTML file: CSS selectors, text and attribute
 * extraction, attribute-based filtering, and the element traversal API.
 *
 * <p>Lookups that can come back empty ({@code first()} returning {@code null},
 * an empty {@code Elements}) are checked before use, so a document that does
 * not match the expected structure is reported on {@code System.err} instead
 * of failing with a {@code NullPointerException}.
 *
 * <p>An {@code IOException} while reading the file is caught and its stack
 * trace printed.
 */
public static void domTest(){
    try {
        File file = new File("D:\\Pyt_Pyc_Java_Hadoop_Ws_Django\\IDEA\\Java爬虫\\demo\\src\\index.html");
        Document doc = Jsoup.parse(file, "UTF-8");

        // Descendant selector: every <li> below ul.title-state-ul.
        Elements lis = doc.select("ul.title-state-ul li");
        for (Element li : lis) {
            System.out.println(li.text());
        }

        // first() returns null when the selection is empty.
        Element li_f = doc.select("ul.title-state-ul li").first();
        if (li_f == null) {
            System.err.println("domTest: no <li> found under ul.title-state-ul");
            return;
        }
        System.out.println(li_f.text());

        System.out.println("===属性提取===");
        // Child selector: only direct <li> children of the list.
        Elements select = doc.select("ul.title-state-ul>li");
        if (select.isEmpty()) {
            System.err.println("domTest: ul.title-state-ul has no direct <li> children");
            return;
        }
        Element element = select.get(0);
        Element first = element.select("span.state3").first();
        Element a = (first == null) ? null : first.select("a").first();
        if (a == null) {
            System.err.println("domTest: span.state3 > a not found in the first list item");
        } else {
            System.out.println(a.text());
            System.out.println(a.attr("href"));  // a single attribute
            System.out.println(a.attributes());  // all attributes
        }

        // Filter by attribute: href starting with /political.
        Elements select1 = doc.select("a[href^=/political]");
        System.out.println(select1);

        System.out.println("===模糊匹配===");
        // Regex match on the class attribute.
        Elements select2 = doc.select("span[class~=state]");
        System.out.println(select2);

        System.out.println("========遍历(Traversal)API==========");
        if (first == null) {
            // Nothing to traverse from.
            return;
        }
        // The locals below are unused on purpose; they only illustrate each traversal call.
        Element parent = first.parent();                        // parent node
        Elements children = first.children();                   // child elements
        Element brother_n = first.nextElementSibling();         // next sibling (null if none)
        Element brother_p = first.previousElementSibling();     // previous sibling (null if none)
        Element brother_f = first.firstElementSibling();        // first sibling
        Elements brothers = first.siblingElements();            // all siblings
        System.out.println(brothers);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

# 连接 mysql 数据库并插入数据

# mysql connect 依赖下载

下载地址:https://mvnrepository.com/artifact/mysql/mysql-connector-java

根据自己的 mysql 服务下载对应的版本,同第二步配置 jar 包

# 连接 MySQL 数据库并插入数据
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
 * Connects to the MySQL database and inserts one sample row into
 * {@code data_java} using a parameterized statement.
 *
 * <p>The connection and the statement are closed automatically on every path.
 *
 * @throws RuntimeException wrapping the {@code SQLException} if connecting or inserting fails
 */
public static void dbSave(){
    String url = "jdbc:mysql://192.168.10.129:3306/spider";
    String user = "root";
    String pwd = "123456";
    String sql = "insert into data_java(id,title,content,dtime) values(?,?,?,?)";
    // try-with-resources closes the statement first, then the connection,
    // also when executeUpdate() throws.
    try (Connection conn = DriverManager.getConnection(url, user, pwd)) {
        System.out.println("数据库连接成功!");
        try (PreparedStatement statement = conn.prepareStatement(sql)) {
            statement.setString(1, "1");
            statement.setString(2, "噪音极大");
            statement.setString(3, "等待处理:17天8小时14分");
            statement.setString(4, "2024-02-29 15:21:24");
            statement.executeUpdate();
            System.out.println("插入成功!");
        }
    } catch (SQLException e) {
        throw new RuntimeException(e);
    }
}

# 多线程爬虫

# 线程池
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Thread-pool example: runs 100 small jobs on a fixed pool of 5 threads and
 * reports how long they took.
 */
public class MultiThreadExample {
    /**
     * Submits 100 jobs to a 5-thread pool, waits for all of them to finish,
     * then prints the elapsed time.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Fixed-size pool with 5 worker threads.
        ExecutorService executorService = Executors.newFixedThreadPool(5);
        long start = System.currentTimeMillis();
        // Hand 100 jobs to the pool.
        for (int i = 0; i < 100; i++) {
            final int page = i;
            executorService.submit(() -> job(page));
        }
        // Stop accepting new work; jobs already submitted still run.
        executorService.shutdown();
        try {
            // submit() returns immediately, so the elapsed time is only meaningful
            // once the pool has actually drained.
            executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            // Give up on the remaining jobs and keep the interrupt flag for the caller.
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
            return;
        }
        long end = System.currentTimeMillis();
        System.out.println("任务执行完毕,耗时:" + (end - start) + "ms");
    }

    /**
     * The unit of work executed by the pool: prints the page number.
     *
     * @param page page number assigned to this job
     */
    public static void job(int page){
        System.out.println("page:"+page);
    }
}
# 完整 demo

目标网站:https://wzzdg.sun0769.com/political/index/politicsNewest?id=1&page=1

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Complete crawler demo: downloads 20 listing pages concurrently, extracts the
 * rows with jsoup and stores them in MySQL.
 */
public class Demo {
    private static final String DB_USERNAME = "root";
    private static final String DB_PASSWORD = "123456";
    private static final String DB_URL = "jdbc:mysql://192.168.10.129:3306/spider";
    private static final String BASE_URL = "https://wzzdg.sun0769.com/political/index/politicsNewest";

    /**
     * Schedules one crawl-and-store task per page (1 to 20) on a 5-thread pool.
     * A failure in one page is printed and does not affect the other pages.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ExecutorService executorService = Executors.newFixedThreadPool(5);
        for (int i = 1; i <= 20; i++) {
            final int page = i;
            CompletableFuture.runAsync(() -> {
                try {
                    String url = BASE_URL + "?id=1&page=" + page;
                    // Fetch the page.
                    String content = crawl(url);
                    // Parse and persist it.
                    process(content);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }, executorService);
        }
        // No new tasks are accepted; the ones already queued still run.
        executorService.shutdown();
    }

    /**
     * Downloads the given URL and returns the body decoded as UTF-8.
     *
     * @param url absolute URL to request
     * @return the response body
     * @throws RuntimeException wrapping the {@code IOException} when the request fails
     *         or the status code is outside 200-299
     */
    public static String crawl(String url) {
        HttpGet httpGet = new HttpGet(url);
        // Both the client and the response are closed on every path, so repeated
        // calls from the pool do not accumulate open connections.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int status = response.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300) {
                throw new ClientProtocolException("==>请求失败<==");
            }
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, "UTF-8");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Parses one listing page and inserts every list item into {@code data_java}.
     * A {@code SQLException} is caught and its stack trace printed.
     *
     * @param content HTML of a listing page
     */
    public static void process(String content) {
        // The second argument of Jsoup.parse(String, String) is the base URI used to
        // resolve relative links, not a charset; content is already a decoded String.
        Document doc = Jsoup.parse(content, BASE_URL);
        Elements lis = doc.select("ul.title-state-ul li");
        String sql = "insert into data_java(id,status,title,massage,dtime) values(?,?,?,?,?)";
        // The statement is now closed together with the connection, even when an insert fails.
        try (Connection conn = DriverManager.getConnection(DB_URL, DB_USERNAME, DB_PASSWORD);
             PreparedStatement statement = conn.prepareStatement(sql)) {
            for (Element li : lis) {
                String id = li.select("span.state1").text();
                String status = li.select("span.state2").text();
                String title = li.select("span.state3>a").text();
                String massage = li.select("span.state4").text();
                String dtime = li.select("span.state5").text();
                statement.setString(1, id);
                statement.setString(2, status);
                statement.setString(3, title);
                statement.setString(4, massage);
                statement.setString(5, dtime);
                statement.executeUpdate();
                System.out.println("id:"+id+", title:"+title+" --> 保存成功");
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}