如何给PDF文件加目录？

Question

如何给PDF文件加目录？

关注者

55

被浏览

490,432

登录后你可以

不限量看优质回答私信答主深度交流精彩内容一键收藏

查看全部 19 个回答

介绍一种适合程序员的方法，只需要有 python 编程环境即可，不需要安装第三方 PDF 软件。

① 事前准备

首先安装 python 第三方库 PyPDF2： pip install PyPDF2

注意：原版 PyPDF2 在读取某些 PDF 时存在 bug，需要根据 https:// github.com/mstamy2/PyPD F2/issues/368 中提到的方法对源代码进行简单的修改，即把 ${PYTHON_PATH}/site-packages/PyPDF2/pdf.py 中的
dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs)
改成
dest = Destination(NullObject(), pageRef, NameObject(fit), *zoomArgs)

然后创建包含以下代码的脚本（假设命名为 PDFbookmark.py ）：

import re
import sys
from distutils.version import LooseVersion
from os.path import exists, splitext
from PyPDF2 import PdfFileReader, PdfFileWriter
is_python2 = LooseVersion(sys.version) < '3'
def _get_parent_bookmark(current_indent, history_indent, bookmarks):
    '''The parent of A is the nearest bookmark whose indent is smaller than A's
    assert len(history_indent) == len(bookmarks)
    if current_indent == 0:
        return None
    for i in range(




    
len(history_indent) - 1, -1, -1):
        # len(history_indent) - 1   ===>   0
        if history_indent[i] < current_indent:
            return bookmarks[i]
    return None
def addBookmark(pdf_path, bookmark_txt_path, page_offset):
    if not exists(pdf_path):
        return "Error: No such file: {}".format(pdf_path)
    if not exists(bookmark_txt_path):
        return "Error: No such file: {}".format(bookmark_txt_path)
    with open(bookmark_txt_path, 'r', encoding='utf-8') as f:
        bookmark_lines = f.readlines()
    reader = PdfFileReader(pdf_path)
    writer = PdfFileWriter()
    writer.cloneDocumentFromReader(reader)
    maxPages = reader.getNumPages()
    bookmarks, history_indent = [], []
    # decide the level of each bookmark according to the relative indent size in each line
    #   no indent:          level 1
    #     small indent:     level 2
    #       larger indent:  level 3
    #   ...
    for line in bookmark_lines:
        line2 = re.split(r'\s+', unicode(line.strip(), 'utf-8')) if is_python2 else re.split(r'\s+', line.strip())
        if len(line2) == 1:
            continue
        indent_size = len(line) - len(line.lstrip())
        parent = _get_parent_bookmark(indent_size, history_indent, bookmarks)
        history_indent.append(indent_size)
        title, page = ' '.join(line2[:-1]), int(line2[-1]) - 1
        if page + page_offset >= maxPages:
            return "Error: page index out of range: %d >= %d" % (page + page_offset, maxPages)
        new_bookmark = writer.addBookmark(title, page + page_offset, parent=parent)
        bookmarks.append(new_bookmark)
    out_path = splitext(pdf_path)[0] + '-new.pdf'
    with open(out_path,'wb') as f:
        writer.write(f)
    return "The bookmarks have been added to %s" % out_path
if __name__ == "__main__":
    import sys
    args = sys.argv
    if len(args) != 4:
        print("Usage: %s [pdf] [bookmark_txt] [page_offset]" % args[0])
    else:
        print(addBookmark(args[1], args[2], int(args[3])))

补充：上述代码中的 page_offset 一般设成 0 即可，它只会在特殊的情况下使用，即 PDF 自带的目录中页码和实际页码存在偏差（通常是一个固定的偏移）的情况，这时只需要将 page_offset 改成实际相应的偏差页数即可。

准备一个 txt 文本文件（假设命名为 toc.txt ），在其中手动录入需要添加的目录，或者从文本型 PDF 中直接复制出目录，采用类似下面格式即可：

Introduction                            14
I. Interview Questions                            99
    Data Structures                            100
    Chapter 1 | Arrays and Strings                            100
        Hash Tables                            100
        StringBuilder                            101
    Chapter 2 | Linked Lists                            104
        Creating a Linked List                            104
        The "Runner" Technique                            105
    Additional Review Problems                            193

唯一的要求是每一行的最后一项是页码（前面的空白符个数不限），并且相同级别的书签要采用相同的缩进量（空格或 tab 都可以）。

【注意】如果是在 Windows 上创建 txt 文档的话，需要注意将文件保存为 UTF-8 编码格式，不然的话其中的汉字用第 ① 步的脚本读取出来就是乱码了。（Windows 上直接创建的 txt 文档一般默认都是 ANSI 编码格式）

然后在第 ① 步中创建的 PDFbookmark.py 所在目录下打开终端/命令提示符，运行以下代码即可：

其中最后一个参数 9 表示将 toc.txt 文件中的页码全部偏移 +9（即全部加上 9）

python ./PDFbookmark.py '/Users/Emrys/Desktop/demo 2015.pdf' '/Users/Emrys/Desktop/toc.txt' 9

运行之后，会在要添加目录的 PDF 文件同目录下面生成一个新的 PDF 文件，名字为 [原 PDF 文件名]-new.pdf 。

演示效果如下：

【补充】

如果上面脚本要处理的 PDF 文件之前被其他 PDF 编辑器软件修改过目录/书签的话，可能会导致类似下面的报错：

Traceback (most recent call last):
  File ".\PDFbookmark.py", line 70, in <module>
    print(addBookmark(args[1], args[2], int(args[3])))
  File ".\PDFbookmark.py", line 55, in addBookmark
    new_bookmark = writer.addBookmark(title, page + page_offset, parent=parent)
  File "C:\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 732, in addBookmark
    outlineRef = self.getOutlineRoot()
  File "C:\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 607,




    
 in getOutlineRoot
    idnum = self._objects.index(outline) + 1
ValueError: {'/Type': '/Outlines', '/Count': 0} is not in list

这种情况下，需要把 PyPDF2 源码中 pdf.py 文件的 getOutlineRoot() 函数修改一下 ^[1] （源码文件路径是 ${PYTHON_PATH}/site-packages/PyPDF2/pdf.py）：

    def getOutlineRoot(self):
        if '/Outlines' in self._root_object:
            outline = self._root_object['/Outlines']
            try:
                idnum = self._objects.index(outline) + 1
            except ValueError:
                if not isinstance(outline, TreeObject):
                    def _walk(node):
                        node.__class__ = TreeObject
                        for child in node.children():
                            _walk(child)
                            if child not in self._objects:
                                self._addObject(child)
                    _walk(outline)
                outlineRef = self._addObject(outline)
                self._root_object[NameObject('/Outlines')] = outlineRef
                idnum = self._objects.index(outline) + 1
            outlineRef = IndirectObject(idnum, 0, self)
            assert outlineRef.getObject() == outline
        else:
            outline = TreeObject()
            outline.update({ })
            outlineRef = self._addObject(outline)
            self._root_object[NameObject('/Outlines')] = outlineRef
        return outline

如果你在做了上述修改之后，运行脚本时遇到了 RuntimeError: generator raised StopIteration 的错误，请检查一下是不是 Python 版本大于等于 3.7（因为从 v3.7 版本之后，Python 对终止迭代的方式处理发生了变化，详情可以参考 PEP 479 ），如果是的话，请尝试使用 v3.7 之前版本的 Python 来运行上面的代码。
另一种解决方案是尝试将 PDF 中已有的目录/书签删除掉，然后再运行 Python 脚本。

如果在 Mac OS 上，还可以将上述代码和 Alfred 结合 ^[2] ，操作更加方便：

【更新】顺便介绍一下用 Python 导出 PDF 中目录/书签的方法：

import sys
from distutils.version import LooseVersion
from os.path import exists
from PyPDF2 import PdfFileReader
is_python2 = LooseVersion(sys.version) < '3'
def _parse_outline_tree(reader, outline_tree, level=0):
    """Return List[Tuple[level(int), page(int), title(str)]]"""
    ret = []
    for heading in outline_tree:
        if isinstance(heading, list):
            # contains sub-headings
            ret.extend(_parse_outline_tree(reader, heading, level=level+1))
        else:
            ret.append((level, reader.getDestinationPageNumber(heading), heading.title))
    return ret
def extractBookmark(pdf_path, bookmark_txt_path):
    if not exists(pdf_path):
        return "Error: No such file: {}".format(pdf_path)
    if exists(bookmark_txt_path):
        print("Warning: Overwritting {}".format(bookmark_txt_path))
    reader = PdfFileReader(pdf_path)
    # List of ('Destination' objects) or ('Destination' object lists)
    #  [{'/Type': '/Fit', '/Title': u'heading', '/Page': IndirectObject(6, 0)}, ...]
    outlines = reader.outlines
    # List[Tuple[level(int), page(int), title(str)]]
    outlines = _parse_outline_tree(reader, outlines)
    max_length = max(len(item[-1]) + 2 * item[0] for item in outlines) + 1
    # print(outlines)
    with open(bookmark_txt_path, 'w') as f:
        for level, page, title in outlines:
            level_space = '  ' * level
            title_page_space = ' ' * (max_length - level * 2 - len(title))
            if is_python2:
                title = title.encode('utf-8')
            f.write("{}{}{}{}\n".format(level_space, title, title_page_space, page))
    return "The bookmarks have been exported to %s" % bookmark_txt_path
if __name__ == "__main__":
    import sys
    args = sys.argv
    if len(args) != 3:
        print("Usage: %s [pdf] [bookmark_txt]" % args[0])
    else:
        print(extractBookmark(args[1], args[




    
2]))

【更新】

最近尝试了一下另一个第三方库 pikepdf ，以下是基于 pikepdf （要求 Python 3.7 以上版本）的等效脚本：

需要安装 python 第三方库 pikepdf ：pip install pikepdf
（使用方法同上）

import io
import re
import sys
from distutils.version import LooseVersion
from pathlib import Path
from pikepdf import Array, Name, OutlineItem, Page, Pdf, String
if LooseVersion(sys.version) < "3.7":
    raise NotImplementedError("pikepdf requires Python 3.7+")
#################
# Add bookmarks #
#################
def _get_parent_bookmark(current_indent, history_indent, bookmarks):
    '''The parent of A is the nearest bookmark whose indent is smaller than A's
    assert len(history_indent) == len(bookmarks)
    if current_indent == 0:
        return None
    for i in range(len(history_indent) - 1, -1, -1):
        # len(history_indent) - 1   ===>   0
        if history_indent[i] < current_indent:
            return bookmarks[i]
    return None
def addBookmark(pdf_path, bookmark_txt_path, page_offset):
    if not Path(pdf_path).exists():
        return "Error: No such file: {}".format(pdf_path)
    if not Path(bookmark_txt_path).exists():
        return "Error: No such file: {}".format(bookmark_txt_path)
    with io.open(bookmark_txt_path, 'r', encoding='utf-8') as f:
        bookmark_lines = f.readlines()
    pdf = Pdf.open(pdf_path)
    maxPages = len(pdf.pages)
    bookmarks, history_indent = [], []
    # decide the level of each bookmark according to the relative indent size in each line
    #   no indent:          level 1
    #     small indent:     level 2
    #       larger indent:  level 3
    #   ...
    with pdf.open_outline() as outline:
        for line in bookmark_lines:
            line2 = re.split(r'\s+', line.strip())
            if len(line2) == 1:
                continue
            indent_size = len(line) - len(line.lstrip())
            parent = _get_parent_bookmark(indent_size, history_indent, bookmarks)
            history_indent.append(indent_size)
            title, page = ' '.join(line2[:-1]), int(line2[-1]) - 1
            if page + page_offset >= maxPages:
                return "Error: page index out of range: %d >= %d" % (page + page_offset, maxPages)
            new_bookmark = OutlineItem(title, page + page_offset)
            if parent is None:
                outline.root.append(new_bookmark)
            else:
                parent.children.append(new_bookmark)
            bookmarks.append(new_bookmark)
    out_path = Path(pdf_path)
    out_path = out_path.with_name(out_path.stem + "-new.pdf")
    pdf.save(out_path)
    return "The bookmarks have been added to %s" % out_path
#####################
# Extract bookmarks #
#####################
def _getDestinationPageNumber(outline, names):
    def find_dest(ref, names):
        resolved = None
        if isinstance(ref, Array):
            resolved = ref[0]
        else:
            for n in range(0, len(names) - 1, 2):
                if names[n] == ref:
                    if names[n+1]._type_name == 'array':
                        named_page = names[n+1][0]
                    elif names[n+1]._type_name == 'dictionary':
                        named_page = names[n+1].D[0]
                    else:
                        raise TypeError("Unknown type: %s" % type(names[n+1]))
                    resolved = named_page
                    break
        if resolved is not None:
            return Page(resolved).index
    if outline.destination is not None:
        if isinstance(outline.destination, Array):
            # 12.3.2.2 Explicit destination
            # [raw_page, /PageLocation.SomeThing, integer parameters for viewport]
            raw_page = outline.destination[0]
            try:
                page = Page(raw_page)
                dest = page.index
            except:
                dest = find_dest(outline.destination, names)
        elif isinstance(outline.destination, String):
            # 12.3.2.2 Named destination, byte string reference to Names
            # dest = f'<Named Destination in document .Root.Names dictionary: {outline.destination}>'
            assert names is not None
            dest = find_dest(outline.destination, names)
        elif isinstance(outline.destination, Name):
            # 12.3.2.2 Named desintation, name object (PDF 1.1)
            # dest = f'<Named Destination in document .Root.Dests dictionary: {outline.destination}>'
            dest = find_dest(outline.destination, names)
        elif isinstance(outline.destination, int):
            # Page number
            dest = outline.destination
        else:
            dest = outline.destination
        return dest
    else:
        return find_dest(outline.action.D, names)
def _parse_outline_tree(outlines, level=0, names=None):
    """Return List[Tuple[level(int), page(int), title(str)]]"""
    ret = []
    if isinstance(outlines, (list, tuple)):
        for heading in outlines:
            # contains sub-headings
            ret.extend(_parse_outline_tree(heading, level=level, names=names))
    else:
        ret.append((level, _getDestinationPageNumber(outlines, names) + 1, outlines.title))
        for subheading in outlines.children:
            # contains sub-headings
            ret.extend(_parse_outline_tree(subheading, level=level+1, names=names))
    return ret
def extractBookmark(pdf_path, bookmark_txt_path):
    # https://github.com/pikepdf/pikepdf/issues/149#issuecomment-860073511
    def has_nested_key(obj, keys):
        ok = True
        to_check = obj
        for key in keys:
            if key in to_check.keys():
                to_check = to_check[key]
            else:
                ok = False
                break
        return ok
    def get_names(pdf):
        if has_nested_key(pdf.Root, ['/Names', '/Dests']):
            obj = pdf.Root.Names.Dests
            names = []
            ks = obj.keys()
            if '/Names' in ks:
                names.extend(obj.Names)
            elif '/Kids' in ks:
                for k in obj.Kids:
                    names.extend(get_names(k))
            else:
                assert False
            return names
        else:
            return None
    if not Path(pdf_path).exists():
        return "Error: No such file: {}".format(pdf_path)
    if Path(bookmark_txt_path).exists():
        print("Warning: Overwritting {}".format(bookmark_txt_path))
    pdf = Pdf.open(pdf_path)
    names = get_names(pdf)
    with pdf.open_outline() as outline:
        outlines = _parse_outline_tree(outline.root, names=names)
    if len(outlines) == 0:
        return "No bookmark is found in %s" % pdf_path
    # List[Tuple[level(int), page(int), title(str)]]
    max_length = max(len(item[-1]) + 2 * item[0] for item in outlines) + 1
    # print(outlines)
    with open(bookmark_txt_path, 'w') as f:
        for level, page, title in outlines:
            level_space = '  ' * level
            title_page_space = ' ' * (max_length - level * 2 - len(title))
            f.write("{}{}{}{}\n".format(level_space, title, title_page_space, page))
    return "The bookmarks have been exported to %s" % bookmark_txt_path
if __name__ == "__main__":
    import sys