AozoraEpub3の見出しのバグ内容　修正方法の検討 - KDP(電子出版)のメモ　急急如律令

見出しのバグは特定の条件で発生するようなのだが、発生しないように修正した。その他に気になったところは次のとおり。

h1タグをpタグで囲ってしまう。
画像の説明alt属性
画像のメモリ消費が激しい。

「見出しでh1タグの場合にpタグで囲まれてしまう」の修正コード

99nyorituryo.hatenablog.com

あなた:

次のコードを解説して

 /**
     * Writes one buffered line to the output.
     * If a page-break trigger is pending, the page-break handling is performed first.
     * Blank lines are not written immediately; they are counted and flushed as
     * {@code <p>…</p>} lines just before the next non-blank line is written.
     *
     * @param out output destination; when {@code null} nothing is written and only the
     *            tag level and the buffer are updated
     * @param buf the line to output; always cleared before this method returns
     * @param lineNum line number passed to the chapter lookup, section writer and error log
     * @param noBr true when the line must not be wrapped in a p tag because a block
     *             annotation is closed on a later line
     * //@param chapterLevel chapter level: none=0, large heading=1, medium heading=2, heading=2, small heading=3
     *   (for pattern extraction, follows the settings or the level chosen in the TOC list)
     * @throws IOException if writing to {@code out} fails
     */
    private void printLineBuffer(BufferedWriter out, StringBuilder buf, int lineNum, boolean noBr) throws IOException
    {
        String line = buf.toString();
        int length = buf.length();
        // A line consisting only of whitespace is treated as an empty line
        if (CharUtils.isSpace(line)) { line = ""; length = 0; }

        int idIdx = 1;
        String chapterId = null;

        ChapterLineInfo chapterLineInfo = null;
        // When empty-line removal is enabled, whitespace-only lines count as empty lines
        // NOTE(review): the check above already empties whitespace-only lines, so this
        // branch looks unreachable if CharUtils.isSpace is deterministic - confirm.
        if (this.removeEmptyLine > 0 && length > 0 && CharUtils.isSpace(line)) {
            line = "";
            length = 0;
        }
        if (length == 0) {
            // Empty line: just count it. Empty lines before the body of a centered page are ignored.
            if (!this.skipMiddleEmpty && !noBr) {
                this.printEmptyLines++;
            }
            // Clear the buffer
            buf.setLength(0);
            return;
        }

        // Output of the buffered string
        // Heading (chapter) information for this line, or null
        chapterLineInfo = this.bookInfo.getChapterLineInfo(lineNum);

        // Count opening/closing tags (done first because forced page breaks depend on it)
        int tagStart = 0;
        int tagEnd = 0;
        boolean inTag = false;
        for (int i=0; i<length; i++) {
            if (inTag) {
                // Self-closing "/>" counts as a closing tag. The bounds check prevents a
                // StringIndexOutOfBoundsException when the line ends with '/' inside an
                // unterminated tag (same guard as the "</" check below).
                if (i<length-1 && line.charAt(i) == '/' && line.charAt(i+1) == '>') tagEnd++;
                if (line.charAt(i) == '>') inTag = false;
            } else {
                if (line.charAt(i) == '<') {
                    if (i<length-1 && line.charAt(i+1) == '/') tagEnd++;
                    else tagStart++;
                    inTag = true;
                }
            }
        }

        if (out != null) {

        // Forced page break
        // Only when no page-break trigger is set yet and we are outside of any tag
        if (this.forcePageBreak && this.pageBreakTrigger == null && this.tagLevel == 0) {
            // Forced page break on a line basis
            if (this.pageByteSize > this.forcePageBreakSize) {
                this.setPageBreakTrigger(pageBreakNoChapter);
            } else {
                if (forcePageBreakEmptyLine > 0 && this.printEmptyLines >= forcePageBreakEmptyLine && this.pageByteSize > this.forcePageBreakEmptySize) {
                    // Split at a run of empty lines
                    this.setPageBreakTrigger(pageBreakNoChapter);
                } else if (forcePageBreakChapterLevel > 0 && this.pageByteSize > this.forcePageBreakChapterSize) {
                    // Split at a chapter. If the next line is a heading that would end up
                    // inside a tag opened on this line, break one line earlier.
                    if (chapterLineInfo != null) this.setPageBreakTrigger(pageBreakNoChapter);
                    else if (tagStart-tagEnd > 0 && this.bookInfo.getChapterLevel(lineNum+1) > 0) this.setPageBreakTrigger(pageBreakNoChapter);
                }
            }
        }

        // A page-break trigger is set and this line is not empty
        if (this.pageBreakTrigger != null) {
            // Page break on an empty page
            //if (sectionCharLength == 0) {
            //  out.write(chukiMap.get("改行")[0]);
            //}

            // Perform the page break
            if (this.pageBreakTrigger.pageType != PageBreakType.PAGE_NORMAL) {
                // Centered page
                this.writer.nextSection(out, lineNum, this.pageBreakTrigger.pageType, PageBreakType.IMAGE_PAGE_NONE, null);
            } else {
                // Everything else
                this.writer.nextSection(out, lineNum, PageBreakType.PAGE_NORMAL, this.pageBreakTrigger.imagePageType, this.pageBreakTrigger.srcFileName);
            }

            // Reset per-page state
            this.pageByteSize = 0;
            this.sectionCharLength = 0;
            if (tagLevel > 0) LogAppender.error(lineNum, "タグが閉じていません");
            this.tagLevel = 0;
            this.lineIdNum = 0;

            this.pageBreakTrigger = null;
        }

        this.skipMiddleEmpty = false;
        // Empty lines were only counted so far; emit them before the text of this line
        if (this.printEmptyLines > 0) {
            String br = chukiMap.get("改行")[0];
            int lines = Math.min(this.maxEmptyLine, this.printEmptyLines-this.removeEmptyLine);
            // Keep at least one empty line when the run starts within 3 lines after a heading
            if (lastChapterLine >= lineNum-this.printEmptyLines-2) {
                lines = Math.max(1, lines);
            }
            for (int i=lines-1; i>=0; i--) {
                out.write("<p>");
                out.write(br);
                out.write("</p>\n");
            }
            // 8 = length of "<p>" plus "</p>\n"
            this.pageByteSize += (br.length()+8)*lines;
            this.printEmptyLines = 0;
        }

        this.lineIdNum++;
        if (noBr) {
            // Set the ID for a heading
            if (chapterLineInfo != null) {
                chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
                if (line.startsWith("<")) {
                    // The line starts with a tag, so put the ID on that tag
                    line = line.replaceFirst("(<[\\d|\\w]+)", "$1 id=\""+chapterId+"\"");
                } else {
                    // No leading tag: wrap the first character in a span carrying the ID
                    out.write("<span id=\""+chapterId+"\">"+line.charAt(0)+"</span>");
                    this.pageByteSize += (chapterId.length() + 20);
                    line = line.substring(1);
                }
            }
        } else {
            // Open the p for the line. Headings always get an ID; the "kobo." prefixed ID
            // can also be used as a Kobo bookmark ID.
            if (this.withMarkId || (chapterLineInfo != null && !chapterLineInfo.pageBreakChapter)) {
                chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
                out.write("<p id=\""+chapterId+"\">");
                this.pageByteSize += (chapterId.length() + 14);
            }
            else {
                out.write("<p>");
                this.pageByteSize += 7;
            }
        }
        out.write(line);
        // Add the line's UTF-8 byte count to the page size
        if (this.forcePageBreak) this.pageByteSize += line.getBytes(StandardCharsets.UTF_8).length;

        // Close the p opened above
        if (!noBr) {
            out.write("</p>\n");
        }

        // Register the heading with the writer; this method is called several times
        // for the same line, so only do it the first time
        if (chapterLineInfo != null && lastChapterLine != lineNum) {
            String name = chapterLineInfo.getChapterName();
            if (name != null && !name.isEmpty()) {
                // Auto-extracted levels have 10 added; reduce them back to a single digit
                if (chapterLineInfo.pageBreakChapter) this.writer.addChapter(null, name, chapterLineInfo.level%10);
                else this.writer.addChapter(chapterId, name, chapterLineInfo.level%10);
                lastChapterLine = lineNum;
            }
        }

        this.sectionCharLength += length;

        }

        // Update the tag nesting level
        this.tagLevel += tagStart-tagEnd;

        // Clear the buffer
        buf.setLength(0);
    }

ChatGPT:

いいですね。このコードは、テキスト行（文字列バッファ）をHTML化して出力する処理で、電子書籍（EPUBなど）向けの整形処理に使われています。目的としては、

空行や見出し（章タイトル）を処理し、
<p>タグで本文を囲み、
改ページのトリガー（page break）を判定・挿入するといった作業を行っています。

空行や見出し（章タイトル）を処理し、

<p>タグで本文を囲み、

改ページのトリガー（page break）を判定・挿入するといった作業を行っています。

private void printLineBuffer(BufferedWriter out, StringBuilder buf, int lineNum, boolean noBr)

引数	内容
out	出力先（HTMLファイルなど）
buf	出力対象の1行（StringBuilder形式）
lineNum	行番号
noBr	<p>タグで囲まない特殊ケース（注記など）

int length = buf.length();
if (CharUtils.isSpace(line)) { line = ""; length = 0; }

String line = buf.toString();
int length = buf.length();
if (CharUtils.isSpace(line)) { line = ""; length = 0; }

空白だけの行を「空行」として扱う。
removeEmptyLine > 0 の場合は、余分な空行を除去。

空白だけの行を「空行」として扱う。

removeEmptyLine > 0 の場合は、余分な空行を除去。

    if (!this.skipMiddleEmpty && !noBr) {
        this.printEmptyLines++;
    }
    buf.setLength(0);
    return;
}

if (length == 0) {
    if (!this.skipMiddleEmpty && !noBr) {
        this.printEmptyLines++;
    }
    buf.setLength(0);
    return;
}

空行を見つけたら、後で <p><br></p> として出力するためにカウントしておく。
すぐには出力せず、次の本文行でまとめて出す。

空行を見つけたら、後で <p><br></p> として出力するためにカウントしておく。

すぐには出力せず、次の本文行でまとめて出す。

chapterLineInfo = this.bookInfo.getChapterLineInfo(lineNum);

現在の行が「章」「見出し」などに該当するかをチェック。
この情報には level（大見出し・中見出しなど）や name（章タイトル）が入る。

現在の行が「章」「見出し」などに該当するかをチェック。

この情報には level（大見出し・中見出しなど）や name（章タイトル）が入る。

for (int i=0; i<length; i++) {
    ...
}

int tagStart = 0, tagEnd = 0;
for (int i=0; i<length; i++) {
    ...
}

<p> や <div> などの開始タグ・終了タグの個数を数えて、タグの「深さ」（階層レベル）を管理。
これにより「タグが閉じられていない」といったHTML不整合を検出できる。

<p> や <div> などの開始タグ・終了タグの個数を数えて、タグの「深さ」（階層レベル）を管理。

これにより「タグが閉じられていない」といったHTML不整合を検出できる。

  ...
}

if (this.forcePageBreak && this.pageBreakTrigger == null && this.tagLevel == 0) {
    ...
}

電子書籍用にページサイズや章単位での分割を自動判定。
文字数(pageByteSize)、空行数、見出しレベルなどを見て、

改ページサイズを超えた

空行が多い

次が新章などの条件で this.setPageBreakTrigger() を呼び出す。

電子書籍用にページサイズや章単位での分割を自動判定。

文字数(pageByteSize)、空行数、見出しレベルなどを見て、

改ページサイズを超えた
空行が多い
次が新章などの条件で this.setPageBreakTrigger() を呼び出す。

改ページサイズを超えた

空行が多い

次が新章などの条件で this.setPageBreakTrigger() を呼び出す。

    this.writer.nextSection(out, lineNum, ...);
    this.pageByteSize = 0;
    this.sectionCharLength = 0;
    this.tagLevel = 0;
    this.pageBreakTrigger = null;
}

if (this.pageBreakTrigger != null) {
    this.writer.nextSection(out, lineNum, ...);
    this.pageByteSize = 0;
    this.sectionCharLength = 0;
    this.tagLevel = 0;
    this.pageBreakTrigger = null;
}

改ページが必要なら writer.nextSection() を呼び出し、新しいページを開始。
ページ内の文字数・タグ階層などを初期化。

改ページが必要なら writer.nextSection() を呼び出し、新しいページを開始。

ページ内の文字数・タグ階層などを初期化。

    String br = chukiMap.get("改行")[0];
    for (int i=lines-1; i>=0; i--) {
        out.write("<p>");
        out.write(br);
        out.write("</p>\n");
    }
    this.printEmptyLines = 0;
}

if (this.printEmptyLines > 0) {
    String br = chukiMap.get("改行")[0];
    for (int i=lines-1; i>=0; i--) {
        out.write("<p>");
        out.write(br);
        out.write("</p>\n");
    }
    this.printEmptyLines = 0;
}

前段でカウントしておいた空行を <p><br></p> 形式で出力。
空行の最大数（maxEmptyLine）なども制御。

前段でカウントしておいた空行を <p><br></p> 形式で出力。

空行の最大数（maxEmptyLine）なども制御。

  // pタグで括らない場合（見出しなど）
} else {
    out.write("<p id=\"kobo.～\">");
}
out.write(line);
if (!noBr) out.write("</p>\n");

if (noBr) {
    // pタグで括らない場合（見出しなど）
} else {
    out.write("<p id=\"kobo.～\">");
}
out.write(line);
if (!noBr) out.write("</p>\n");

通常は <p> タグで囲って出力。
見出し行（chapterLineInfoあり）や注記行（noBr=true）の場合は個別処理。
Kobo用に "kobo.{行番号}.{連番}" の形式でIDを付与しており、電子書籍リーダーのしおり機能に対応。

通常は <p> タグで囲って出力。

見出し行（chapterLineInfoあり）や注記行（noBr=true）の場合は個別処理。

Kobo用に "kobo.{行番号}.{連番}" の形式でIDを付与しており、電子書籍リーダーのしおり機能に対応。

    this.writer.addChapter(chapterId, name, chapterLineInfo.level % 10);
    lastChapterLine = lineNum;
}

if (chapterLineInfo != null && lastChapterLine != lineNum) {
    this.writer.addChapter(chapterId, name, chapterLineInfo.level % 10);
    lastChapterLine = lineNum;
}

新しい見出し行を writer に登録（目次生成などに使われる）。

新しい見出し行を writer に登録（目次生成などに使われる）。

buf.setLength(0);

this.tagLevel += tagStart - tagEnd;
buf.setLength(0);

タグ階層の更新とバッファ初期化。

タグ階層の更新とバッファ初期化。

段階	処理内容
①	空白・空行を整理
②	見出し情報を取得
③	タグ階層の解析
④	改ページ判定
⑤	改ページ処理
⑥	溜まった空行の出力
⑦	本文出力（<p>タグ）
⑧	見出し登録
⑨	状態リセット

このコードはAozoraEpub3やその派生系のような、「青空文庫テキスト → EPUB変換ツール」でよく見られる構造です。 BufferedWriter で逐次HTMLを出力しながら、

改ページ
見出し抽出
しおり用IDの埋め込みを一括して行う高機能な「1行出力関数」です。

改ページ

見出し抽出

しおり用IDの埋め込みを一括して行う高機能な「1行出力関数」です。

希望があれば、 👉 「この中の○○（例：改ページ処理・chapterLineInfo）」だけを詳しく図解といった形にもできます。どの部分をもう少し詳しく見たいですか？

あなた:

h1やdivのときにpタグを付与しないようにしたい

ChatGPT:

良いですね。つまり「<h1> や <div> など、すでにブロック要素で始まる行には <p>〜</p> を付けたくない」ということですね。今のコードでは、noBr フラグや chapterLineInfo の有無に応じて <p> タグを出力していますが、ブロック要素の自動判定はしていません。

次のような処理を追加するのがシンプルで安全です。

boolean isBlockTag = line.matches("^\\s*<(h\\d|div|table|ul|ol|li|blockquote|section|article|header|footer)\\b.*");

これで行頭が <h1> や <div> のようなタグで始まっていれば true になります。そしてこの isBlockTag が true の場合は <p> タグを出力しないように条件を追加します。

元コードのこの部分：

  //見出し用のID設定
    if (chapterLineInfo != null) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        if (line.startsWith("<")) {
            //タグがあるのでIDを設定
            line = line.replaceFirst("(<[\\d|\\w]+)", "$1 id=\""+chapterId+"\"");
        } else {
            out.write("<span id=\""+chapterId+"\">"+line.charAt(0)+"</span>");
            this.pageByteSize += (chapterId.length() + 20);
            line = line.substring(1);
        }
    }
} else {
    //改行用のp出力 見出しなら強制ID出力 koboの栞用IDに利用可能なkobo.のIDで出力
    if (this.withMarkId || (chapterLineInfo != null && !chapterLineInfo.pageBreakChapter)) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        out.write("<p id=\""+chapterId+"\">");
        this.pageByteSize += (chapterId.length() + 14);
    }
    else {
        out.write("<p>");
        this.pageByteSize += 7;
    }
}
out.write(line);
if (!noBr) {
    out.write("</p>\n");
}

if (noBr) {
    //見出し用のID設定
    if (chapterLineInfo != null) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        if (line.startsWith("<")) {
            //タグがあるのでIDを設定
            line = line.replaceFirst("(<[\\d|\\w]+)", "$1 id=\""+chapterId+"\"");
        } else {
            out.write("<span id=\""+chapterId+"\">"+line.charAt(0)+"</span>");
            this.pageByteSize += (chapterId.length() + 20);
            line = line.substring(1);
        }
    }
} else {
    //改行用のp出力 見出しなら強制ID出力 koboの栞用IDに利用可能なkobo.のIDで出力
    if (this.withMarkId || (chapterLineInfo != null && !chapterLineInfo.pageBreakChapter)) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        out.write("<p id=\""+chapterId+"\">");
        this.pageByteSize += (chapterId.length() + 14);
    }
    else {
        out.write("<p>");
        this.pageByteSize += 7;
    }
}
out.write(line);
if (!noBr) {
    out.write("</p>\n");
}

これを次のように変更します👇

boolean isBlockTag = line.matches("^\\s*<(h\\d|div|table|ul|ol|li|blockquote|section|article|header|footer)\\b.*");

if (noBr || isBlockTag) {
    // 見出し用のID設定（pタグで囲まない）
    if (chapterLineInfo != null) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        if (line.startsWith("<")) {
            //タグがあるのでIDを設定
            line = line.replaceFirst("(<[\\d|\\w]+)", "$1 id=\""+chapterId+"\"");
        } else {
            out.write("<span id=\""+chapterId+"\">"+line.charAt(0)+"</span>");
            this.pageByteSize += (chapterId.length() + 20);
            line = line.substring(1);
        }
    }
    out.write(line);
    out.write("\n");
} else {
    // 通常の本文行を <p>〜</p> で出力
    if (this.withMarkId || (chapterLineInfo != null && !chapterLineInfo.pageBreakChapter)) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        out.write("<p id=\""+chapterId+"\">");
        this.pageByteSize += (chapterId.length() + 14);
    } else {
        out.write("<p>");
        this.pageByteSize += 7;
    }

    out.write(line);
    out.write("</p>\n");
}

boolean isBlockTag = line.matches("^\\s*<(h\\d|div|table|ul|ol|li|blockquote|section|article|header|footer)\\b.*");

if (noBr || isBlockTag) {
    // 見出し用のID設定（pタグで囲まない）
    if (chapterLineInfo != null) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        if (line.startsWith("<")) {
            //タグがあるのでIDを設定
            line = line.replaceFirst("(<[\\d|\\w]+)", "$1 id=\""+chapterId+"\"");
        } else {
            out.write("<span id=\""+chapterId+"\">"+line.charAt(0)+"</span>");
            this.pageByteSize += (chapterId.length() + 20);
            line = line.substring(1);
        }
    }
    out.write(line);
    out.write("\n");
} else {
    // 通常の本文行を <p>〜</p> で出力
    if (this.withMarkId || (chapterLineInfo != null && !chapterLineInfo.pageBreakChapter)) {
        chapterId = "kobo."+this.lineIdNum+"."+(idIdx++);
        out.write("<p id=\""+chapterId+"\">");
        this.pageByteSize += (chapterId.length() + 14);
    } else {
        out.write("<p>");
        this.pageByteSize += 7;
    }

    out.write(line);
    out.write("</p>\n");
}

isBlockTag は <h1>〜<h6>、<div>、<section>などブロックレベル要素で始まる行を検出します。
noBr は元々の「pタグを付けない」指定なので、それも併せて除外条件に。
<p> の閉じ忘れを防ぐために、条件ごとに明示的に </p> を書いています。

isBlockTag は <h1>〜<h6>、<div>、<section>などブロックレベル要素で始まる行を検出します。

noBr は元々の「pタグを付けない」指定なので、それも併せて除外条件に。

<p> の閉じ忘れを防ぐために、条件ごとに明示的に </p> を書いています。

この変更を入れると：

入力行	出力
これは本文です。	`<p>`これは本文です。`</p>`
`<h1>`章タイトル`</h1>`	`<h1>`章タイトル`</h1>`
`<div class="note">`注釈`</div>`	`<div class="note">`注釈`</div>`
`<p>`元からpタグがある行`</p>`	`<p>``<p>`元からpタグがある行`</p>``</p>`（上記の正規表現には p が含まれていないため、このままでは二重に囲まれる。重複を防ぐには正規表現の候補に p を追加する必要がある）