如何使用PDFBox读取PDF的各个部分(标题、摘要、参考文献)？ - 开发者社区

文章/答案/技术大牛

发布

public class PDFTextSectionStripper extends PDFTextStripper
    // constructor
    public PDFTextSectionStripper(List<TextSectionDefinition> sectionDefinitions) throws IOException
        super();
        this.sectionDefinitions = sectionDefinitions;
    // Section retrieval
     * @return an unmodifiable list of text sections recognized during {@link #getText(PDDocument)}.
    public List<TextSection> getSections()
        return Collections.unmodifiableList(sections);
    // PDFTextStripper overrides
    @Override
    protected void writeLineSeparator() throws IOException
        super.writeLineSeparator();
        if (!currentLine.isEmpty())
            boolean matched = false;
            if (!(currentHeader.isEmpty() && currentBody.isEmpty()))
                TextSectionDefinition definition = sectionDefinitions.get(currentSectionDefinition);
                switch (definition.multiLine)
                case multiLine:
                    if (definition.matchPredicate.test(currentLine))
                        currentBody.add(new ArrayList<>(currentLine));
                        matched = true;
                    break;
                case multiLineHeader:
                case multiLineIntro:
                    boolean followUpMatch = false;
                    for (int i = definition.multiple ? currentSectionDefinition : currentSectionDefinition + 1;
                            i < sectionDefinitions.size(); i++)
                        TextSectionDefinition followUpDefinition = sectionDefinitions.get(i);
                        if (followUpDefinition.matchPredicate.test(currentLine))
                            followUpMatch = true;
                            break;
                    if (!followUpMatch)
                        currentBody.add(new ArrayList<>(currentLine));
                        matched = true;
                    break;
                case singleLine:
                    System.out.println("Internal error: There can be no current header or body as long as the current definition is single line only");
                if (!matched)
                    sections.add(new TextSection(definition, currentHeader, currentBody));
                    currentHeader.clear();
                    currentBody.clear();
                    if (!definition.multiple)
                        currentSectionDefinition++;
            if (!matched)
                while (currentSectionDefinition < sectionDefinitions.size())
                    TextSectionDefinition definition = sectionDefinitions.get(currentSectionDefinition);
                    if (definition.matchPredicate.test(currentLine))
                        matched = true;
                        switch (definition.multiLine)
                        case singleLine:
                            sections.add(new TextSection(definition, currentLine, Collections.emptyList()));
                            if (!definition.multiple)
                                currentSectionDefinition++;
                            break;
                        case multiLineHeader:
                            currentHeader.addAll(new ArrayList<>(currentLine));
                            break;
                        case multiLine:
                        case multiLineIntro:
                            currentBody.add(new ArrayList<>(currentLine));
                            break;
                        break;
                    currentSectionDefinition++;
            if (!matched)
                System.out.println("Could not match line.");
        currentLine.clear();
    @Override
    protected void endDocument(PDDocument document) throws IOException
        super.endDocument(document);
        if (!(currentHeader.isEmpty() && currentBody.isEmpty()))
            TextSectionDefinition definition = sectionDefinitions.get(currentSectionDefinition);
            sections.add(new TextSection(definition, currentHeader, currentBody));
            currentHeader.clear();
            currentBody.clear();
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        super.writeString(text, textPositions);
        currentLine.add(textPositions);
    // member variables
    final List<TextSectionDefinition> sectionDefinitions;
    int currentSectionDefinition = 0;
    final List<TextSection> sections = new ArrayList<>();
    final List<List<TextPosition>> currentLine = new ArrayList<>();
    final List<List<TextPosition>> currentHeader = new ArrayList<>();
    final List<List<List<TextPosition>>> currentBody = new ArrayList<>();
}

public class TextSectionDefinition
    public enum MultiLine
        singleLine,         // A single line without text body, e.g. title
        multiLine,          // Multiple lines, all match predicate, e.g. emails  
        multiLineHeader,    // Multiple lines, first line matches as header, e.g. h1
        multiLineIntro      // Multiple lines, first line matches inline, e.g. abstract
    public TextSectionDefinition(String name, Predicate<List<List<TextPosition>>> matchPredicate, MultiLine multiLine, boolean multiple)
        this.name = name;
        this.matchPredicate = matchPredicate;
        this.multiLine = multiLine;
        this.multiple = multiple;
    final String name;
    final Predicate<List<List<TextPosition>>> matchPredicate;
    final MultiLine multiLine;
    final boolean multiple;
}

public class TextSection
    public TextSection(TextSectionDefinition definition, List<List<TextPosition>> header, List<List<List<TextPosition>>> body)
        this.definition = definition;
        this.header = new ArrayList<>(header);
        this.body = new ArrayList<>(body);
    @Override
    public String toString()
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append(definition.name).append(": ");
        if (!header.isEmpty())
            stringBuilder.append(toString(header));
        stringBuilder.append('\n');
        for (List<List<TextPosition>> bodyLine : body)
            stringBuilder.append("    ").append(toString(bodyLine)).append('\n');
        return stringBuilder.toString();
    String toString(List<List<TextPosition>> words)
        StringBuilder stringBuilder = new StringBuilder();
        boolean first = true;
        for (List<TextPosition> word : words)
            if (first)
                first = false;
                stringBuilder.append(' ');
            for (TextPosition textPosition : word)
                stringBuilder.append(textPosition.getUnicode());
        // cf. https://stackoverflow.com/a/7171932/1729265
        return Normalizer.normalize(stringBuilder, Form.NFKC);
    final TextSectionDefinition definition;
    final List<List<TextPosition>> header;
    final List<List<List<TextPosition>>> body;
}

// Section definitions for the sample paper, distinguished by the font of the first
// glyph of each line. Order matters: definitions are consumed front to back.
List<TextSectionDefinition> sectionDefinitions = Arrays.asList(
        new TextSectionDefinition("Titel", x->x.get(0).get(0).getFont().getName().contains("CMBX12"), MultiLine.singleLine, false),
        new TextSectionDefinition("Authors", x->x.get(0).get(0).getFont().getName().contains("CMR10"), MultiLine.multiLine, false),
        new TextSectionDefinition("Institutions", x->x.get(0).get(0).getFont().getName().contains("CMR9"), MultiLine.multiLine, false),
        new TextSectionDefinition("Addresses", x->x.get(0).get(0).getFont().getName().contains("CMTT9"), MultiLine.multiLine, false),
        new TextSectionDefinition("Abstract", x->x.get(0).get(0).getFont().getName().contains("CMBX9"), MultiLine.multiLineIntro, false),
        new TextSectionDefinition("Section", x->x.get(0).get(0).getFont().getName().contains("CMBX12"), MultiLine.multiLineHeader, true)
);

PDFTextSectionStripper stripper = new PDFTextSectionStripper(sectionDefinitions);
// Close the document once the text has been extracted; the recognized sections
// stay available from the stripper afterwards.
// NOTE(review): if `resource` is an InputStream it should be closed as well - confirm at its declaration.
try (PDDocument document = PDDocument.load(resource)) {
    stripper.getText(document);
}

System.out.println("Sections:");
List<String> texts = new ArrayList<>();
for (TextSection textSection : stripper.getSections()) {
    String text = textSection.toString();
    System.out.println(text);
    texts.add(text);
}
Files.write(new File(RESULT_FOLDER, "Wang05a.txt").toPath(), texts);

Titel: How to Break MD5 and Other Hash Functions
Authors: 
    Xiaoyun Wang and Hongbo Yu
Institutions: 
    Shandong University, Jinan 250100, China,
Addresses: 
    xywang@sdu.edu.cn, yhb@mail.sdu.edu.cn
Abstract: 
    Abstract. MD5 is one of the most widely used cryptographic hash func-
    tions nowadays. It was designed in 1992 as an improvement of MD4, and
Section: 1 Introduction
    People know that digital signatures are very important in information security.
    The security of digital signatures depends on the cryptographic strength of the
Section: 2 Description of MD5
    In order to conveniently describe the general structure of MD5, we first recall
    the iteration process for hash functions.
Section: 3 Differential Attack for Hash Functions
    3.1 The Modular Differential and the XOR Differential
    The most important analysis method for hash functions is differential attack
Section: 4 Differential Attack on MD5
    4.1 Notation
    Before presenting our attack, we first introduce some notation to simplify the
Section: 5 Summary
    In this paper we described a powerful attack against hash functions, and in
    particular showed that finding a collision of MD5 is easily feasible.
Section: Acknowledgements
    It is a pleasure to acknowledge Dengguo Feng for the conversations that led to
    this research on MD5. We would like to thank Eli Biham, Andrew C. Yao, and

问 如何使用PDFBox读取PDF的各个部分(标题、摘要、参考文献)？ EN

回答 1

Stack Overflow用户