一、网络爬虫
爬虫的基本原理很简单,就是利用程序访问互联网,然后将数据保存到本地。我们都知道,互联网提供的服务大多数是以网站的形式提供的,我们需要的数据一般也是从网站中获取的,如电商网站的商品信息、商品评论、微博信息等。爬虫和我们手动将看到的数据复制粘贴下来是类似的,只是大量的数据靠人工获取显然不太现实,因此需要借助工具来帮助我们获取数据。使用程序编写爬虫,就是用程序编写一些网络访问的规则,将我们的目标数据保存下来。接下来,让我们从头开始搭建一个爬虫的案例。
二、HttpClient的入门程序
maven依赖:
<dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpclient</artifactId><version>4.5.2</version></dependency><dependency><groupId>org.slf4j</groupId><artifactId>slf4j-log4j12</artifactId><version>1.7.25</version><scope>test</scope></dependency>
HelloWorld代码
/*** @author River* @date 2020/6/23 6:21* @description*/public class HelloWorld {public static void main(String[] args) throws Exception{//1.打开浏览器,创建对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.网址HttpGet httpGet = new HttpGet("http://www.itcast.cn");//3.发起请求CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析响应,获取数据if(response.getStatusLine().getStatusCode()==200){HttpEntity entity = response.getEntity();String content = EntityUtils.toString(entity, "utf8");System.out.println(content);}}}
返回的数据是网站首页的数据:
三、HttpClient Get请求
public static void main(String[] args) throws Exception{//1.创建对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.访问地址HttpGet httpGet = new HttpGet("http://www.itcast.cn");//3.发起请CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析响应if(response.getStatusLine().getStatusCode()==200){HttpEntity entity = response.getEntity();String content = EntityUtils.toString(entity, "utf8");System.out.println(content);}//5.关闭responseresponse.close();httpClient.close();}
带参数的Get请求:使用URIBuilder构造带参数的请求地址
public static void main(String[] args) throws Exception{//1.创建对象CloseableHttpClient httpClient = HttpClients.createDefault();//设置请求地址URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");uriBuilder.setParameter("keys","java");//2.访问地址HttpGet httpGet = new HttpGet(uriBuilder.build());//3.发起请求CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析响应if(response.getStatusLine().getStatusCode()==200){HttpEntity entity = response.getEntity();String content = EntityUtils.toString(entity, "utf8");System.out.println(content);}//5.关闭responseresponse.close();httpClient.close();}
四、Post请求
public static void main(String[] args) throws Exception{//1.创建对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.访问地址HttpPost httpPost = new HttpPost("http://www.itcast.cn");//3.发起请求CloseableHttpResponse response = httpClient.execute(httpPost);//4.解析响应if(response.getStatusLine().getStatusCode()==200){HttpEntity entity = response.getEntity();String content = EntityUtils.toString(entity, "utf8");System.out.println(content);}//5.关闭responseresponse.close();httpClient.close();}
带参数的POST:
//1.创建对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.访问地址HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");// 利用集合封装表单请求参数List<NameValuePair> params = new ArrayList<NameValuePair>();params.add(new BasicNameValuePair("keys","java"));UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");httpPost.setEntity(formEntity);//3.发起请求CloseableHttpResponse response = httpClient.execute(httpPost);//4.解析响应if(response.getStatusLine().getStatusCode()==200){HttpEntity entity = response.getEntity();String content = EntityUtils.toString(entity, "utf8");System.out.println(content);}//5.关闭responseresponse.close();httpClient.close();}
HttpClient连接池
public static void main(String[] args) throws Exception {//1.创建连接池管理器PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();//设置连接数cm.setMaxTotal(100);//设置每个主机的最大连接数cm.setDefaultMaxPerRoute(10);//2.管理器发起请求doGet(cm);}private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception{//1.从连接池中获取HttpClientCloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();//2.发请求HttpGet httpGet = new HttpGet("http://www.itcast.cn");CloseableHttpResponse response = httpClient.execute(httpGet);if (response.getStatusLine().getStatusCode()==200){String content = EntityUtils.toString(response.getEntity(), "utf8");System.out.println(content.length());}//3.关闭response.close();//httpClient.close(); 连接池管理 不必关闭}
HttpClient请求参数
public static void main(String[] args) throws Exception {//1.创建对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.访问地址HttpGet httpGet = new HttpGet("http://www.itcast.cn");// 配置请求信息RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) //创建连接的做大连接时间 一天.setConnectionRequestTimeout(500) //获取连接的最长时间.setSocketTimeout(10*1000) //设置数据传输的最长时间.build();httpGet.setConfig(config);//3.发起请求CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析响应if(response.getStatusLine().getStatusCode()==200){HttpEntity entity = response.getEntity();String content = EntityUtils.toString(entity, "utf8");System.out.println(content);}//5.关闭responseresponse.close();httpClient.close();}
封装
import com.sun.istack.Pool;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;import org.apache.http.util.EntityUtils;import org.springframework.stereotype.Component;import java.io.*;import java.util.UUID;/*** @date 2020/2/8 12:43*/@Component //实例才能使用工具类public class HttpUtils {private PoolingHttpClientConnectionManager cm;public HttpUtils() {cm = new PoolingHttpClientConnectionManager();//设置最大连接数cm.setMaxTotal(100);cm.setDefaultMaxPerRoute(10);}/*** 根据请求地址下载数据* @param url* @return* @throws Exception*/public String doGetHtml(String url) throws Exception {//1.获取对象CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();//2.地址HttpGet httpGet = new HttpGet(url);//设置请求信息httpGet.setConfig(getConfig());//3.请求数据CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析if (response.getStatusLine().getStatusCode()==200){if (response.getEntity() != null){String content = EntityUtils.toString(response.getEntity(), "utf8");return content;}}if (response != null){response.close();}return ""; //没有数据时候返回空}/*** 下载图片* @param url* @return*/public String doGetImage(String url) throws Exception {//1.获取对象CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();//2.地址HttpGet httpGet = new HttpGet(url);//设置请求信息httpGet.setConfig(getConfig());//3.请求数据CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析if (response.getStatusLine().getStatusCode()==200){if (response.getEntity() != null){//获取图片的后缀String extName = url.substring(url.lastIndexOf("."));//重命名图片String picName = UUID.randomUUID().toString()+extName;//下载图片OutputStream outputStream = new FileOutputStream(new 
File("D:\\APP\\IDEA\\workplace\\crawler\\images\\"+picName));response.getEntity().writeTo(outputStream);//返回图片名称return picName;}}if (response != null){response.close();}return ""; //没有数据时候返回空}private RequestConfig getConfig() {RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) //创建连接的最长时间.setConnectionRequestTimeout(500) //获取连接的最长时间.setSocketTimeout(10000) //数据传输的最长时间.build();return config;}}
