100行代码爬取JFinal官网文档

时不时会有断网开发的群友索要离线文档,于是写了这个小爬虫,把官网文档整站保存到本地。

依赖很少

java

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.IORuntimeException;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;

pom

<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-http</artifactId>
    <version>5.1.2</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-json</artifactId>
    <version>5.1.2</version>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

code

public static void main(String[] args) {
    try {
        String fileMode = "..";
        String baseurl = "https://jfinal.com";
        Document document = Jsoup.connect("https://jfinal.com/doc").get();
        for (Element it : document.select(".jf-doc2.jf-doc-menus ul>li>a")) {
            String pageHref = it.attr("href");
            String pageURL = baseurl + pageHref;
            Document documentx = Jsoup.connect(pageURL).get();
            documentx.select(".jf-doc2.jf-doc-menus ul>li>a").forEach(itx -> {
                itx.attr("href", fileMode + itx.attr("href") + ".html");
            });
            Elements scripts = documentx.select("script");
            for (Element script : scripts) {
                String src = script.attr("src");
                String scriptText;
                if (StrUtil.isNotBlank(src)) {
                    if (src.startsWith("http")) {
                        if (!src.contains(".js")) {
                            script.attr("src", "").remove();
                        }
                        //scriptText = HttpUtil.get(src);
                    } else {
                        if (src.contains("jfinal-com-4.0.js")) {
                            script.attr("src", "").remove();
                            continue;
                        }
                        src = src.substring(0, src.lastIndexOf(".js") + 3);
                        if (!src.startsWith("/")) {
                            System.out.println("非/开头的script");
                            System.out.println(src);
                        }
                        script.attr("src", fileMode + src);
                        File file = new File("./jfinal", src);
                        if (!file.exists()) {
                            scriptText = HttpUtil.get(baseurl + src);
                            try {
                                FileUtil.writeString(scriptText, file, Charset.defaultCharset());
                            } catch (IORuntimeException e) {
                                e.printStackTrace();
                            }
                        }

                    }
                }
            }

            scripts = documentx.select("link[type=\"text/css\"]");
            for (Element link : scripts) {
                String href = link.attr("href");
                String scriptText;
                if (StrUtil.isNotBlank(href)) {
                    href = href.substring(0, href.lastIndexOf(".css") + 4);
                    if (href.startsWith("http")) {
                        String n = IdUtil.fastSimpleUUID();
                        String url = "/assets/css/" + n + ".css";
                        link.attr("href", fileMode + url);
                        HttpUtil.downloadFile(href, new File("./jfinal", url));
                    } else {
                        if (!href.startsWith("/")) {
                            System.out.println("非/开头的style");
                            System.out.println(href);
                        }
                        link.attr("href", fileMode + href);
                        File file = new File("./jfinal", href);
                        if (!file.exists()) {
                            File parent = file.getParentFile();
                            if (!parent.exists())
                                parent.mkdirs(); // 路径中包含小数点 , 会创建失败
                            scriptText = HttpUtil.get(baseurl + href);
                            try {
                                FileUtil.writeString(scriptText, file, Charset.defaultCharset());
                            } catch (IORuntimeException e) {
                                System.out.println(href);
                                e.printStackTrace();
                            }
                        }

                    }
                }
            }
            scripts = documentx.select("img");
            for (Element img : scripts) {
                String src = img.attr("src");
                if (StrUtil.isNotBlank(src)) {
                    if (src.startsWith("http")) {
                        String n = IdUtil.fastSimpleUUID();
                        String url = "/img/" + n + ".jpg";
                        img.attr("src", fileMode + url);
                        HttpUtil.downloadFile(src, new File("./jfinal", url));
                    } else {
                        img.attr("src", fileMode + src);
                        File file = new File("./jfinal", src);
                        File parent = file.getParentFile();
                        if (!parent.exists()) {
                            parent.mkdirs();
                        }
                        HttpUtil.downloadFile(baseurl + src, new File("./jfinal", src));
                    }
                }
            }

            documentx.select(".doc-pre-next-box a").forEach(itx->{
                itx.attr("href", fileMode + itx.attr("href") + ".html");
            });

            File file = new File("./jfinal", pageHref + ".html");
            FileUtil.writeString(documentx.toString(), file, Charset.defaultCharset());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

使用方法:新建 Maven 工程,在项目根目录下新建名为 jfinal 的目录;新建一个 Java 类,粘贴上面的代码并运行 main 方法;运行结束后,直接用浏览器打开 jfinal 目录下生成的 html 文件即可。


评论区

小佳

2020-03-12 09:03

对一些初学者掌握爬虫技术分析页面的技术来说,是一项福音呦

jfinal4cyy

2020-03-12 09:21

感谢,已成功爬取

JFinal

2020-03-12 23:00

代码很简洁, jsoup 技术选型极好,其使用类似 jquery 选择器的方式定位需要解析的数据,方便强大,赞

快乐的蹦豆子

2020-03-13 09:51

@山东小木 其实天爱也不错

山东小木

2020-03-13 21:30

热门分享

扫码入社