欢迎来到尧图网

客户服务 关于我们

您的位置:首页 > 新闻 > 会展 > Java开发笔记Ⅱ(Jsoup爬虫)

Java开发笔记Ⅱ(Jsoup爬虫)

2024/10/26 16:28:33 来源:https://blog.csdn.net/qq_46039856/article/details/139474614  浏览:    关键词:Java开发笔记Ⅱ(Jsoup爬虫)

Jsoup 爬虫

Java 也能写爬虫!!!

Jsoup重要对象如下:

Document:文档对象,每个html页面都是一个Document对象

Element:元素对象,一个Document对象里有多个Element对象

Node:节点对象,是 Element、TextNode、Comment 等类型的抽象基类;元素、文本、注释都是节点,标签名称和属性则通过节点提供的方法(如 nodeName()、attr())获取

Jsoup的主要方法如下:

static Connection connect(String url) 创建URL连接

static Document parse(File in, String charsetName) 解析文件为 Document 对象

static Document parse(String html) 解析html代码为 Document 对象

(虽然上边是最主要的方法,但是下边这段代码中,是用 document对象 + css 选择器来获取的信息)

爬虫示例(豆瓣)

   /*** 通过访问接口获取代理IP*/public void initIPPool() {System.out.println("开始获取IP...");Process proc;try {// 这个代码之前是python改的,这里偷懒直接调用,这个文件贴在后边proc = Runtime.getRuntime().exec("python getIP.py");BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream()));String line = null;while ((line = in.readLine()) != null) {System.out.println(line);}in.close();proc.waitFor();} catch (Exception e) {System.out.println(e.toString());}System.out.println("成功获取代理IP");}/*** 从存储代理IP的文件获取代理IP*/public void loadIPPool() {File file = new File("ipPool.txt");List<String> list = new ArrayList<String>();synchronized (this) {BufferedReader reader = null;try {reader = new BufferedReader(new FileReader(file));String tempString = null;// 一次读入一行,直到读入null为文件结束while ((tempString = reader.readLine()) != null) {list.add(tempString);}reader.close();} catch (IOException e) {e.printStackTrace();} finally {if (reader != null) {try {reader.close();} catch (IOException e1) {System.out.println(e1.toString());}}}}System.out.println(list);myIPPool = list.toArray(new String[list.size()]);System.out.println("成功载入IP代理池");}public String crawlOnce(Integer start) {StringBuilder finalResult = new StringBuilder();Random random = new Random();// 请求地址String url ="http://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=" +(start);HttpGet request = new HttpGet(url);String proxyIp = myIPPool[random.nextInt(myIPPool.length)];while (proxyIp.split(":").length != 2) {// 在代理ip池里随机获取一个ipproxyIp = myIPPool[random.nextInt(myIPPool.length)];}HttpHost proxy = new HttpHost(proxyIp.split(":")[0],Integer.parseInt(proxyIp.split(":")[1]));SSLContextBuilder builder = new SSLContextBuilder();// 全部信任 不做身份鉴定PoolingHttpClientConnectionManager cm = null;SSLConnectionSocketFactory sslsf = null;try {builder.loadTrustMaterial(null, new TrustStrategy() {@Overridepublic boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {return true;}});sslsf = new 
SSLConnectionSocketFactory(builder.build(), new String[]{"SSLv2Hello","SSLv3", "TLSv1", "TLSv1.2"}, null, NoopHostnameVerifier.INSTANCE);Registry<ConnectionSocketFactory> registry =RegistryBuilder.<ConnectionSocketFactory>create().register("http",new PlainConnectionSocketFactory()).register("https", sslsf).build();cm = new PoolingHttpClientConnectionManager(registry);cm.setMaxTotal(200);//max connection} catch (Exception e) {System.out.println(e.toString());return "";}//设置认证CredentialsProvider provider = new BasicCredentialsProvider();//第一个参数对应代理httpHost,第二个参数设置代理的用户名和密码,如果代理不需要用户名和密码,填空provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials("", ""));//实例化CloseableHttpClient对象CloseableHttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslsf).setConnectionManager(cm).setConnectionManagerShared(true).setDefaultCredentialsProvider(provider).build();RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectTimeout(CONNECTION_TIME_OUT).setConnectionRequestTimeout(CONNECTION_TIME_OUT).setSocketTimeout(CONNECTION_TIME_OUT).build();request.setConfig(config);//添加请求头request.addHeader("User-Agent", myUAPool[random.nextInt(myUAPool.length)]);request.addHeader("Cookie", myCookies[random.nextInt(myCookies.length)]);request.addHeader("Accept-Language", "zh-CN,zh;q=0.9");request.addHeader("Sec-Fetch-Mode", "cors");request.addHeader("Sec-Fetch-Site", "same-origin");HttpResponse response = null;BufferedReader rd = null;try {response = httpClient.execute(request);rd = new BufferedReader(new InputStreamReader(response.getEntity().getContent()));} catch (IOException e) {logError(start);return "";}String line = "";StringBuilder result = new StringBuilder();while (true) {try {line = rd.readLine();if (line == null) {break;}} catch (IOException e) {logError(start);break;}// 请求返回了html页面if (line.equals("") || line.charAt(0) == '<') {break;}result.append(line);}System.out.println((start) + "--result:" + result);JSONObject res = 
JSONObject.parseObject(String.valueOf(result));if (res == null || !res.containsKey("data")) {logError(start);return "";}JSONArray jsonArray = res.getJSONArray("data");for (int i = 0; i < jsonArray.size(); i++) {JSONObject jo = jsonArray.getJSONObject(i);// 通过详情链接爬取电影详情finalResult.append(crawlDetails(jo.getString("url")));}return finalResult.toString();}// 爬取详情private String crawlDetails(String url) {String result = "";Random random = new Random();try {String proxyIp = myIPPool[random.nextInt(myIPPool.length)];// myUAPool这里可以换几个浏览器把useragent手写在变量里Connection con = Jsoup.connect(url).proxy(proxyIp.split(":")[0], Integer.parseInt(proxyIp.split(":")[1])).timeout(10000).userAgent(myUAPool[random.nextInt(myUAPool.length)]).header("Accept-Language", "zh-CN,zh;q=0.9").header("Cookie", myCookies[random.nextInt(myCookies.length)]).timeout(CONNECTION_TIME_OUT); // 设置连接超时时间// 执行连接,获取页面Connection.Response response = con.execute();Document document = con.get();String info = document.select("#info").text();// IDresult += url.substring(33, url.length() - 1);// 标题result += "," + document.select("#content > h1 > span:nth-child(1)").text();// 年份result += "," + document.select("#content > h1 > span.year").text();// 导演result += "," + document.select("#info > span:nth-child(1) > span.attrs > a").text();// 编剧result += "," + document.select("#info > span:nth-child(3) > span.attrs").text();// 主演result += "," + document.select("#info > span.actor > span.attrs").text();// 类型result += "," + document.select("[property=v:genre]").text();// 产地result += "," + info.substring(info.indexOf("制片国家/地区: "), info.indexOf(" 语言:")).substring("制片国家/地区: ".length());// 语言if (info.contains(" 上映日期:")) {result += "," + info.substring(info.indexOf("语言: "), info.indexOf(" 上映日期:")).substring("语言: ".length());} else {result += "," + info.substring(info.indexOf("语言: ")).substring("语言: ".length());}// 片长result += "," + document.select("[property=v:genre]").attr("content");// 评分result += "," 
+document.select("#interest_sectl > div > div.rating_self.clearfix > strong").text();// 5result += "," + document.select("#interest_sectl > div.rating_wrap.clearbox > div" +".ratings-on-weight > div:nth-child(1) > span" +".rating_per").text();// 4result += "," + document.select("#interest_sectl > div.rating_wrap.clearbox > div" +".ratings-on-weight > div:nth-child(2) > span" +".rating_per").text();// 3result += "," + document.select("#interest_sectl > div.rating_wrap.clearbox > div" +".ratings-on-weight > div:nth-child(3) > span" +".rating_per").text();// 2result += "," + document.select("#interest_sectl > div.rating_wrap.clearbox > div" +".ratings-on-weight > div:nth-child(4) > span" +".rating_per").text();// 1result += "," + document.select("#interest_sectl > div.rating_wrap.clearbox > div" +".ratings-on-weight > div:nth-child(5) > span" +".rating_per").text();// 评分人数result += "," + document.select("[property=v:votes]").text();// 评论数result +="," + document.select("#comments-section > div.mod-hd > h2 > span > a").text();System.out.println(proxyIp + " " + result);} catch (IOException e) {System.out.println(e.toString());}return result + "\n";}

获取代理IP的代码

# coding=UTF-8
import requests
import json


class FreeIP():
    """Downloads a public proxy list, keeps the proxies that work and saves them to a file."""

    def __init__(self):
        # Proxy list website
        self.url = "http://proxylist.fatezero.org/proxy.list"
        # Placeholder: replace the value with a real browser user-agent before sending it.
        self.headers = {"User-Agent": "这里改为浏览器的useragent"}

    def check_ip(self, ip_list):
        """Return the "host:port" strings of the proxies in ip_list that pass a live check.

        ip_list is a list of dicts with "host" and "port" keys. Checking stops once more
        than 10 working proxies have been collected.
        """
        correct_ip = []
        for ip in ip_list:
            if len(correct_ip) > 10:  # adjust or remove this limit as needed
                break
            ip_port = "{}:{}".format(ip["host"], ip["port"])
            proxies = {'https': ip_port}
            try:
                # The proxy counts as working when the echoed address equals the proxy host.
                response = requests.get('https://icanhazip.com/', proxies=proxies,
                                        timeout=3).text  # the timeout can be changed
                if response.strip() == ip["host"]:
                    correct_ip.append(ip_port)
            except requests.RequestException:
                # Unreachable or misbehaving proxy: skip it and try the next one.
                continue
        return correct_ip

    def run(self):
        """Download the list, keep high-anonymity https proxies, verify them, write ipPool.txt."""
        response = requests.get(url=self.url, timeout=10).content.decode()
        ip_list = []
        # The list holds one JSON object per line.
        for proxy_str in response.split('\n'):
            try:
                proxy_json = json.loads(proxy_str)
                if proxy_json["anonymity"] == "high_anonymous" and proxy_json["type"] == "https":
                    ip_list.append({"host": proxy_json['host'], "port": proxy_json['port']})
            except (ValueError, KeyError, TypeError):
                # Blank line, malformed JSON or a record without the expected keys.
                continue
        correct_ip = self.check_ip(ip_list)
        file_path = 'ipPool.txt'
        # Write the verified proxies to this file, one per line.
        with open(file_path, mode='w', encoding='utf-8') as file_obj:
            for i in correct_ip:
                file_obj.write(i + "\n")


if __name__ == '__main__':
    ip = FreeIP()
    ip.run()

版权声明:

本网仅为发布的内容提供存储空间,不对发表、转载的内容提供任何形式的保证。凡本网注明“来源:XXX网络”的作品,均转载自其它媒体,著作权归作者所有,商业转载请联系作者获得授权,非商业转载请注明出处。

我们尊重并感谢每一位作者,均已注明文章来源和作者。如因作品内容、版权或其它问题,请及时与我们联系,联系邮箱:809451989@qq.com,投稿邮箱:809451989@qq.com