Java web crawler, part 2
Published: 2019-05-22


This crawler copes with sites that are otherwise hard to fetch by trusting forged certificates: it installs a trust-all certificate manager and a permissive hostname verifier, so HTTPS certificate checks never fail. This handles the great majority of sites that would otherwise refuse to be crawled because of such protection.
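Stripped to its essentials, that setup looks roughly like the sketch below. This is a condensed, illustrative version only (class and method names here are made up); the full program further down installs its own miTM trust manager and a HostnameVerifier that also logs the host being verified.

import java.security.cert.X509Certificate;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

public class TrustAllSketch {
    // Install an SSLContext whose single trust manager accepts every certificate,
    // plus a hostname verifier that accepts every host, so pages served over HTTPS
    // with self-signed or otherwise invalid certificates can still be fetched.
    static void trustEverything() throws Exception {
        TrustManager[] trustAll = { new X509TrustManager() {
            public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
            public void checkClientTrusted(X509Certificate[] certs, String authType) { }
            public void checkServerTrusted(X509Certificate[] certs, String authType) { }
        } };
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, trustAll, new java.security.SecureRandom());
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
        HttpsURLConnection.setDefaultHostnameVerifier((hostname, session) -> true);
    }
}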

The core crawling algorithm follows a breadth-first search strategy. This avoids the depth-first failure mode of drilling all the way down a single branch, collecting a pile of useless links along the way, and ending up far less efficient.
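In outline, the breadth-first idea is a queue of links still to visit plus a record of links already seen. The sketch below is a minimal, illustrative version under that assumption; the full program below instead tracks the same state with a LinkedHashMap<String, Boolean> of link -> visited flag.

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

class BfsCrawlSketch {
    static Set<String> crawl(String seedUrl) {
        Set<String> visited = new LinkedHashSet<>();   // links already fetched
        Deque<String> queue = new ArrayDeque<>();      // links waiting to be fetched
        queue.add(seedUrl);
        while (!queue.isEmpty()) {
            String url = queue.poll();
            if (!visited.add(url)) {
                continue;                              // already seen, skip it
            }
            for (String next : extractLinks(url)) {    // fetch the page, pull out its links
                if (!visited.contains(next)) {
                    queue.add(next);                   // breadth-first: append to the tail
                }
            }
        }
        return visited;
    }

    // Placeholder only: in the real program this is a GET request plus a regex over each line.
    static List<String> extractLinks(String url) {
        return Collections.emptyList();
    }
}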

The full program is as follows:

package javaCrawler2;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;

public class javaCrawler2 {

    // Install an SSLContext that trusts every certificate, so HTTPS sites with
    // invalid or self-signed certificates can still be fetched.
    private static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
        javax.net.ssl.TrustManager tm = new miTM();
        trustAllCerts[0] = tm;
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    // A trust manager that accepts every client and server certificate.
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return null;
        }

        public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            return;
        }

        public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            return;
        }
    }

    public static void main(String[] args) {
        // Accept every host name when verifying HTTPS connections, logging the mismatch.
        HostnameVerifier hv = new HostnameVerifier() {
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
                return true;
            }
        };
        try {
            trustAllHttpsCertificates();
        } catch (Exception e) {
            e.printStackTrace();
        }
        HttpsURLConnection.setDefaultHostnameVerifier(hv);
        javaCrawler2 webCrawlerDemo = new javaCrawler2();
        webCrawlerDemo.myPrint("http://blog.csdn.net/qq_29606255?viewmode=contents");
    }

    public void myPrint(String baseUrl) {
        Map<String, Boolean> oldMap = new LinkedHashMap<String, Boolean>(); // link -> has it been visited
        String oldLinkHost = ""; // the host
        Pattern p = Pattern.compile("(https?://)?[^/\\s]*"); // e.g. http://www.zifangsky.cn
        Matcher m = p.matcher(baseUrl);
        if (m.find()) {
            oldLinkHost = m.group();
        }
        oldMap.put(baseUrl, false);
        oldMap = crawlLinks(oldLinkHost, oldMap);
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("Link: " + mapping.getKey());
            String Src = RegexString(mapping.getKey(), ".com//(.+)");
            FileOutputStream out = null;
            try {
                out = new FileOutputStream(new File("C:\\Users\\yihong\\Desktop\\c.html"), true);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
            if (!"Nothing".equals(Src)) {
                String result = sendGet(Src);
                try {
                    out.write(result.getBytes());
                } catch (IOException e) {
                    e.printStackTrace();
                }
                System.out.println(result);
            } else {
                String result2 = sendGet(mapping.getKey());
                try {
                    out.write(result2.getBytes());
                } catch (IOException e) {
                    e.printStackTrace();
                }
                System.out.println(result2);
            }
        }
    }

    /**
     * Crawl every reachable page link of a site, breadth-first: keep issuing GET
     * requests against links that have not been visited yet, until a full pass over
     * the collection turns up no new link, at which point the crawl is finished.
     *
     * @param oldLinkHost the host, e.g. http://www.zifangsky.cn
     * @param oldMap      the collection of links still to be visited
     * @return all links collected during the crawl
     */
    private Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
        Map<String, Boolean> newMap = new LinkedHashMap<String, Boolean>();
        String oldLink = "";
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("link:" + mapping.getKey() + "--------check:" + mapping.getValue());
            // Only handle links that have not been visited yet
            if (!mapping.getValue()) {
                oldLink = mapping.getKey();
                // Issue a GET request
                try {
                    URL url = new URL(oldLink);
                    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
                    connection.setRequestMethod("GET");
                    connection.setConnectTimeout(2000);
                    connection.setReadTimeout(2000);

                    if (connection.getResponseCode() == 200) {
                        InputStream inputStream = connection.getInputStream();
                        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
                        String line = "";
                        // Anchor-tag pattern: group(1) is the link, group(3) is the title
                        Pattern pattern = Pattern
                                .compile("<a.*?href=[\"']?((https?://)?/?[^\"']+)[\"']?.*?>(.+)</a>");
                        Matcher matcher = null;
                        while ((line = reader.readLine()) != null) {
                            matcher = pattern.matcher(line);
                            if (matcher.find()) {
                                String newLink = matcher.group(1).trim(); // the link
                                // String title = matcher.group(3).trim(); // the title
                                // Prepend the host if the link does not start with http
                                if (!newLink.startsWith("http")) {
                                    if (newLink.startsWith("/"))
                                        newLink = oldLinkHost + newLink;
                                    else
                                        newLink = oldLinkHost + "/" + newLink;
                                }
                                // Drop a trailing /
                                if (newLink.endsWith("/"))
                                    newLink = newLink.substring(0, newLink.length() - 1);
                                // Deduplicate and discard links that belong to other sites
                                if (!oldMap.containsKey(newLink) && !newMap.containsKey(newLink)
                                        && newLink.startsWith(oldLinkHost)) {
                                    newMap.put(newLink, false);
                                }
                            }
                        }
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
                oldMap.replace(oldLink, false, true);
            }
        }
        // New links were found, keep crawling
        if (!newMap.isEmpty()) {
            oldMap.putAll(newMap);
            oldMap.putAll(crawlLinks(oldLinkHost, oldMap)); // Map keys stay unique, so no duplicate entries
        }
        return oldMap;
    }

    static String sendGet(String url) {
        // Holds the page content
        String result = "";
        // Buffered character input stream
        BufferedReader in = null;
        try {
            // Turn the string into a URL object
            URL realUrl = new URL(url);
            // Open a connection to that URL
            URLConnection connection = realUrl.openConnection();
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // Actually connect
            connection.connect();
            // Wrap the response in a BufferedReader
            in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            // Temporarily holds each fetched line
            String line;
            while ((line = in.readLine()) != null) {
                // Append every fetched line to the result
                result += line;
            }
        } catch (Exception e) {
            System.out.println("Exception while sending GET request! " + e);
            e.printStackTrace();
        }
        // Close the input stream in a finally block
        finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result;
    }

    static String RegexString(String targetStr, String patternStr) {
        // Compile the regular expression; the capturing group marks what to extract
        Pattern pattern = Pattern.compile(patternStr);
        // Matcher used to run the match
        Matcher matcher = pattern.matcher(targetStr);
        // If a match is found, return the captured group
        if (matcher.find()) {
            return matcher.group(1);
        }
        return "Nothing";
    }
}