Skip to content
Snippets Groups Projects
pdfjs.parser.js 367 KiB
Newer Older
  • Learn to ignore specific revisions
  • Alexey Lunin's avatar
    Alexey Lunin committed
    14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 14239 14240 14241 14242 14243 14244 14245 14246 14247 14248 14249 14250 14251 14252 14253 14254 14255 14256 14257 14258 14259 14260 14261 14262 14263 14264 14265 14266 14267 14268 14269 14270 14271 14272 14273 14274 14275 14276 14277 14278 14279 14280 14281 14282 14283 14284 14285 14286 14287 14288 14289 14290 14291 14292 14293 14294 14295 14296 14297 14298 14299 14300 14301 14302 14303 14304 14305 14306 14307 14308 14309 14310 14311 14312 14313 14314 14315 14316 14317 14318 14319 14320 14321 14322 14323 14324 14325 14326 14327 14328 14329 14330 14331 14332 
14333 14334 14335 14336 14337 14338 14339 14340 14341 14342 14343 14344 14345 14346 14347 14348 14349 14350 14351 14352 14353 14354 14355 14356 14357 14358 14359 14360 14361 14362 14363 14364 14365 14366 14367 14368 14369 14370 14371 14372 14373 14374 14375 14376 14377 14378 14379 14380 14381 14382 14383 14384 14385 14386 14387 14388 14389 14390 14391 14392 14393 14394 14395 14396 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 14407 14408 14409 14410 14411 14412 14413 14414 14415 14416 14417 14418 14419 14420 14421 14422 14423 14424 14425 14426 14427 14428 14429 14430 14431 14432 14433 14434 14435 14436 14437 14438 14439 14440 14441 14442 14443 14444 14445 14446 14447 14448 14449 14450 14451 14452 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 14463 14464 14465 14466 14467 14468 14469 14470 14471 14472 14473 14474 14475 14476 14477 14478 14479 14480 14481 14482 14483 14484 14485 14486 14487 14488 14489 14490 14491 14492 14493 14494 14495 14496 14497 14498 14499 14500 14501 14502 14503 14504 14505 14506 14507 14508 14509 14510 14511 14512 14513 14514 14515 14516 14517 14518 14519 14520 14521 14522 14523 14524 14525 14526 14527 14528 14529 14530 14531 14532 14533 14534 14535 14536 14537 14538 14539 14540 14541 14542 14543 14544 14545 14546 14547 14548 14549 14550 14551 14552 14553 14554 14555 14556 14557 14558 14559 14560 14561 14562 14563 14564 14565 14566 14567 14568 14569 14570 14571 14572 14573 14574 14575 14576 14577 14578 14579 14580 14581 14582 14583 14584 14585 14586 14587 14588 14589 14590 14591 14592 14593 14594 14595 14596 14597 14598 14599 14600 14601 14602 14603 14604 14605 14606 14607 14608 14609 14610 14611 14612 14613 14614 14615 14616 14617 14618 14619 14620 14621 14622 14623 14624 14625 14626 14627 14628 14629 14630 14631 14632 14633 14634 14635 14636 14637 14638 14639
                  kidsOrNames = xref.fetchIfRef(kids[m]);
                  break;
                }
              }
              if (l > r) {
                return null;
              }
            }
    
            // If we get here, then we have found the right entry. Now
            // go through the named destinations in the Named dictionary
            // until we find the exact destination we're looking for.
            var names = kidsOrNames.get("Names");
            if (isArray(names)) {
              // Perform a binary search to reduce the lookup time.
              l = 0;
              r = names.length - 2;
              while (l <= r) {
                // Check only even indices (0, 2, 4, ...) because the
                // odd indices contain the actual D array.
                m = (l + r) & ~1;
                if (destinationId < xref.fetchIfRef(names[m])) {
                  r = m - 2;
                } else if (destinationId > xref.fetchIfRef(names[m])) {
                  l = m + 2;
                } else {
                  return xref.fetchIfRef(names[m + 1]);
                }
              }
            }
            return null;
          }
        };
        return NameTree;
      })();
    
      /**
       * "A PDF file can refer to the contents of another file by using a File
       * Specification (PDF 1.1)", see the spec (7.11) for more details.
       * NOTE: Only embedded files are supported (as part of the attachments support)
       * TODO: support the 'URL' file system (with caching if !/V), portable
       * collections attributes and related files (/RF)
       */
      var FileSpec = (function FileSpecClosure() {
        // Keys probed by pickPlatformItem, in order of preference.
        var PLATFORM_KEYS = ["UF", "F", "Unix", "Mac", "DOS"];

        /**
         * Wraps a file specification dictionary.
         *
         * When `root` is missing or is not a dictionary, the instance is left
         * uninitialized: `filename` then yields `undefined` and `content` yields
         * `null`.
         *
         * @param root - The file specification dictionary.
         * @param xref - Used later to resolve the embedded file reference.
         */
        function FileSpec(root, xref) {
          if (!root || !isDict(root)) {
            return;
          }
          this.xref = xref;
          this.root = root;
          if (root.has("FS")) {
            this.fs = root.get("FS");
          }
          this.description = root.has("Desc")
            ? stringToPDFString(root.get("Desc"))
            : "";
          if (root.has("RF")) {
            warn("Related file specifications are not supported");
          }
          this.contentAvailable = true;
          if (!root.has("EF")) {
            this.contentAvailable = false;
            warn("Non-embedded file specifications are not supported");
          }
        }

        /**
         * Returns the first entry of `dict` found when probing the keys
         * UF, F, Unix, Mac, DOS (in that order).
         *
         * @param dict - The dictionary to probe.
         * @returns The entry value, or `null` when `dict` is not a dictionary or
         *   contains none of the keys.
         */
        function pickPlatformItem(dict) {
          // A malformed file may store something other than a dictionary here
          // (e.g. under /EF); treat that as "no entry" instead of throwing.
          if (!isDict(dict)) {
            return null;
          }
          for (var i = 0; i < PLATFORM_KEYS.length; i++) {
            if (dict.has(PLATFORM_KEYS[i])) {
              return dict.get(PLATFORM_KEYS[i]);
            }
          }
          return null;
        }

        FileSpec.prototype = {
          /**
           * The file name, computed on first access and cached in `_filename`.
           * Falls back to "unnamed" when the dictionary has no name entry.
           */
          get filename() {
            if (!this._filename && this.root) {
              var filename = pickPlatformItem(this.root) || "unnamed";
              // Unescape `\\` and `\/`, then turn every remaining backslash
              // into a forward slash.
              this._filename = stringToPDFString(filename)
                .replace(/\\\\/g, "\\")
                .replace(/\\\//g, "/")
                .replace(/\\/g, "/");
            }
            return this._filename;
          },
          /**
           * The bytes of the embedded file, or `null` when the specification has
           * no /EF entry, the entry is unusable, or it does not resolve to a
           * stream (a warning is emitted in the latter cases).
           */
          get content() {
            if (!this.contentAvailable) {
              return null;
            }
            if (!this.contentRef && this.root) {
              this.contentRef = pickPlatformItem(this.root.get("EF"));
            }
            var content = null;
            if (this.contentRef) {
              var xref = this.xref;
              var fileObj = xref.fetchIfRef(this.contentRef);
              if (fileObj && isStream(fileObj)) {
                content = fileObj.getBytes();
              } else {
                warn(
                  "Embedded file specification points to non-existing/invalid " +
                    "content"
                );
              }
            } else {
              warn("Embedded file specification does not have a content");
            }
            return content;
          },
          /**
           * A plain `{ filename, content }` object built from the two getters
           * above.
           */
          get serializable() {
            return {
              filename: this.filename,
              content: this.content
            };
          }
        };
        return FileSpec;
      })();
    
      /**
       * A helper for loading missing data in object graphs. It traverses the graph
       * depth first and queues up any objects that have missing data. Once it
       * has traversed as many objects as are available, it attempts to bundle the
       * missing data requests and then resumes from the nodes that weren't ready.
       *
       * NOTE: It provides protection from circular references by keeping track
       * of loaded references. However, you must be careful not to load any graphs
       * that have references to the catalog or other pages since that will cause the
       * entire PDF document object graph to be traversed.
       */
      var ObjectLoader = (function() {
        // True for values that may contain, or resolve to, further nodes.
        function mayHaveChildren(value) {
          return isRef(value) || isDict(value) || isArray(value) || isStream(value);
        }

        // Queues `value` for a later visit if it can hold children.
        function enqueueIfContainer(value, nodesToVisit) {
          if (mayHaveChildren(value)) {
            nodesToVisit.push(value);
          }
        }

        // Pushes the direct children of `node` (dictionary/stream-dictionary
        // entries or array elements) that are worth visiting onto `nodesToVisit`.
        function addChildren(node, nodesToVisit) {
          var nodeIsDict = isDict(node);
          if (nodeIsDict || isStream(node)) {
            var entries = nodeIsDict ? node.map : node.dict.map;
            for (var name in entries) {
              enqueueIfContainer(entries[name], nodesToVisit);
            }
            return;
          }
          if (!isArray(node)) {
            return;
          }
          for (var idx = 0, count = node.length; idx < count; idx++) {
            enqueueIfContainer(node[idx], nodesToVisit);
          }
        }

        /**
         * @param obj - Object whose properties named in `keys` are the roots of
         *   the walk.
         * @param keys - Property names of `obj` to start from.
         * @param xref - Used to fetch references and to request missing ranges.
         */
        function ObjectLoader(obj, keys, xref) {
          this.obj = obj;
          this.keys = keys;
          this.xref = xref;
          this.refSet = null;
          this.capability = null;
        }

        ObjectLoader.prototype = {
          /**
           * Starts loading and returns the promise of a fresh capability, which
           * is resolved once the walk finds nothing left to request.
           */
          load: function ObjectLoader_load() {
            var keys = this.keys;
            var capability = createPromiseCapability();
            this.capability = capability;

            // Nothing to walk when the underlying stream is not chunked or has
            // no missing chunks.
            var allDataPresent =
              !(this.xref.stream instanceof ChunkedStream) ||
              this.xref.stream.getMissingChunks().length === 0;
            if (allDataPresent) {
              capability.resolve();
              return capability.promise;
            }

            this.refSet = new RefSet();
            // Collect the root nodes of the walk.
            var source = this.obj;
            var roots = [];
            for (var idx = 0; idx < keys.length; idx++) {
              roots.push(source[keys[idx]]);
            }

            this._walk(roots);
            return capability.promise;
          },

          /**
           * Depth-first walk over `nodesToVisit`. Nodes that hit missing data are
           * collected, their byte ranges are requested in one batch, and the walk
           * is restarted from those nodes once the request completes.
           */
          _walk: function ObjectLoader_walk(nodesToVisit) {
            var loader = this;
            var deferredNodes = [];
            var missingRanges = [];

            while (nodesToVisit.length) {
              var node = nodesToVisit.pop();

              // Only references or chunked streams can cause missing data
              // exceptions.
              if (isRef(node)) {
                // Already-visited references are skipped.
                if (this.refSet.has(node)) {
                  continue;
                }
                try {
                  this.refSet.put(node);
                  node = this.xref.fetch(node);
                } catch (ex) {
                  if (!(ex instanceof MissingDataException)) {
                    throw ex;
                  }
                  // `node` is still the reference here; retry it later.
                  deferredNodes.push(node);
                  missingRanges.push({ begin: ex.begin, end: ex.end });
                }
              }

              if (node && node.getBaseStreams) {
                var hasMissingData = false;
                var baseStreams = node.getBaseStreams();
                for (var idx = 0; idx < baseStreams.length; idx++) {
                  var baseStream = baseStreams[idx];
                  if (
                    baseStream.getMissingChunks &&
                    baseStream.getMissingChunks().length
                  ) {
                    hasMissingData = true;
                    missingRanges.push({
                      begin: baseStream.start,
                      end: baseStream.end
                    });
                  }
                }
                if (hasMissingData) {
                  deferredNodes.push(node);
                }
              }

              addChildren(node, nodesToVisit);
            }

            if (!missingRanges.length) {
              // Everything is loaded.
              this.refSet = null;
              this.capability.resolve();
              return;
            }

            this.xref.stream.manager.requestRanges(missingRanges).then(
              function pendingRequestCallback() {
                // Remove any reference nodes from the current refset so they
                // aren't skipped when we revisit them.
                for (var idx = 0; idx < deferredNodes.length; idx++) {
                  var deferred = deferredNodes[idx];
                  if (isRef(deferred)) {
                    loader.refSet.remove(deferred);
                  }
                }
                loader._walk(deferredNodes);
              },
              this.capability.reject
            );
          }
        };

        return ObjectLoader;
      })();
    
      exports.Catalog = Catalog;
      exports.ObjectLoader = ObjectLoader;
      exports.XRef = XRef;
    });
    
    /* Copyright 2012 Mozilla Foundation
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    ("use strict");
    
    (function(root, factory) {
      //if (typeof define === 'function' && define.amd) {
      //  define('pdfjs/core/document', ['exports', 'pdfjs/shared/util',
      //    'pdfjs/core/primitives', 'pdfjs/core/stream', 'pdfjs/core/obj',
      //    'pdfjs/core/parser', 'pdfjs/core/crypto'], factory);
      // } else if (typeof exports !== 'undefined') {
      //   factory(exports, require('../shared/util.js'), require('./primitives.js'),
      //     require('./stream.js'), require('./obj.js'), require('./parser.js'),
      //     require('./crypto.js'));
      //} else {
      factory(
        (root.pdfjsCoreDocument = {}),
        root.pdfjsSharedUtil,
        root.pdfjsCorePrimitives,
        root.pdfjsCoreStream,
        root.pdfjsCoreObj,
        root.pdfjsCoreParser,
        root.pdfjsCoreCrypto
      );
      //}
    })(window, function(
      exports,
      sharedUtil,
      corePrimitives,
      coreStream,
      coreObj,
      coreParser,
      coreCrypto
    ) {
      var MissingDataException = sharedUtil.MissingDataException;
      var Util = sharedUtil.Util;
      var assert = sharedUtil.assert;
      var error = sharedUtil.error;
      var info = sharedUtil.info;
      var isArray = sharedUtil.isArray;
      var isArrayBuffer = sharedUtil.isArrayBuffer;
      var isString = sharedUtil.isString;
      var shadow = sharedUtil.shadow;
      var stringToBytes = sharedUtil.stringToBytes;
      var stringToPDFString = sharedUtil.stringToPDFString;
      var warn = sharedUtil.warn;
      var Dict = corePrimitives.Dict;
      var isDict = corePrimitives.isDict;
      var isName = corePrimitives.isName;
      var isStream = corePrimitives.isStream;
      var NullStream = coreStream.NullStream;
      var Stream = coreStream.Stream;
      var StreamsSequenceStream = coreStream.StreamsSequenceStream;
      var Catalog = coreObj.Catalog;
      var ObjectLoader = coreObj.ObjectLoader;
      var XRef = coreObj.XRef;
      var Lexer = coreParser.Lexer;
      var Linearization = coreParser.Linearization;
      var calculateMD5 = coreCrypto.calculateMD5;
    
      /**
       * The `PDFDocument` holds all the data of the PDF file. Compared to the
       * `PDFDoc`, this one doesn't have any job management code.
       * Right now there exists one PDFDocument on the main thread + one object
       * for each worker. If there is no worker support enabled, two
       * `PDFDocument` objects are created on the main thread.
       */
      var PDFDocument = (function PDFDocumentClosure() {
        // Number of leading file bytes hashed by `fingerprint` when the trailer
        // has no usable /ID entry.
        var FINGERPRINT_FIRST_BYTES = 1024;
        // A 16-byte, all-zero /ID entry; `fingerprint` ignores such an ID.
        var EMPTY_FINGERPRINT =
          "\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00";

        /**
         * @param pdfManager - Stored on the instance and passed on to `XRef` and
         *   `Catalog`.
         * @param arg - A stream, or an ArrayBuffer (which gets wrapped in a
         *   `Stream`). Any other value is reported through `error`.
         * @param password - Passed on to `XRef`.
         */
        function PDFDocument(pdfManager, arg, password) {
          if (isStream(arg)) {
            init.call(this, pdfManager, arg, password);
          } else if (isArrayBuffer(arg)) {
            init.call(this, pdfManager, new Stream(arg), password);
          } else {
            error("PDFDocument: Unknown argument type");
          }
        }

        // Shared constructor body; must be invoked with `this` bound to the
        // document being created.
        function init(pdfManager, stream, password) {
          assert(stream.length > 0, "stream must have data");
          this.pdfManager = pdfManager;
          this.stream = stream;
          var xref = new XRef(this.stream, password, pdfManager);
          this.xref = xref;
        }

        /**
         * Searches for `needle` in at most `limit` bytes of `stream`, starting at
         * the current stream position.
         *
         * On success the stream position is moved to the start of the match; on
         * failure it is restored to where the search began.
         *
         * @param stream - Must expose `pos`, `end` and `getByte()`.
         * @param {string} needle - The text to look for.
         * @param {number} limit - Maximum number of bytes to inspect (clamped to
         *   the bytes remaining before `stream.end`).
         * @param {boolean} [backwards] - When truthy, the last occurrence inside
         *   the window is used instead of the first.
         * @returns {boolean} Whether `needle` was found.
         */
        function find(stream, needle, limit, backwards) {
          var pos = stream.pos;
          var end = stream.end;
          var strBuf = [];
          if (pos + limit > end) {
            limit = end - pos;
          }
          for (var n = 0; n < limit; ++n) {
            strBuf.push(String.fromCharCode(stream.getByte()));
          }
          var str = strBuf.join("");
          stream.pos = pos;
          var index = backwards ? str.lastIndexOf(needle) : str.indexOf(needle);
          if (index === -1) {
            return false; /* not found */
          }
          stream.pos += index;
          return true; /* found */
        }

        // Maps each accepted document-information key to the predicate its value
        // has to satisfy.
        var DocumentInfoValidators = {
          get entries() {
            // Lazily build this since all the validation functions below are not
            // defined until after this file loads.
            return shadow(this, "entries", {
              Title: isString,
              Author: isString,
              Subject: isString,
              Keywords: isString,
              Creator: isString,
              Producer: isString,
              CreationDate: isString,
              ModDate: isString,
              Trapped: isName
            });
          }
        };

        PDFDocument.prototype = {
          /**
           * Runs `setup`, picks up the catalog's /Version (when it is a name) and
           * determines whether the document carries an AcroForm.
           *
           * `acroForm` is reset to `null` when the entry has neither fields nor
           * XFA data, or when reading it fails.
           *
           * @param recoveryMode - Passed on to `setup`.
           */
          parse: function PDFDocument_parse(recoveryMode) {
            this.setup(recoveryMode);
            var version = this.catalog.catDict.get("Version");
            if (isName(version)) {
              this.pdfFormatVersion = version.name;
            }
            try {
              // checking if AcroForm is present
              this.acroForm = this.catalog.catDict.get("AcroForm");
              if (this.acroForm) {
                this.xfa = this.acroForm.get("XFA");
                var fields = this.acroForm.get("Fields");
                if (
                  (!fields || !isArray(fields) || fields.length === 0) &&
                  !this.xfa
                ) {
                  // no fields and no XFA -- not a form (?)
                  this.acroForm = null;
                }
              }
            } catch (ex) {
              // Missing data is not a malformed entry: let it propagate, like
              // the `linearization` getter does, instead of silently dropping
              // the form.
              if (ex instanceof MissingDataException) {
                throw ex;
              }
              info("Something wrong with AcroForm entry");
              this.acroForm = null;
            }
          },

          /**
           * The result of `Linearization.create` for this stream, or `null` when
           * the stream is empty or creation fails. Cached after the first
           * successful evaluation; a `MissingDataException` is rethrown.
           */
          get linearization() {
            var linearization = null;
            if (this.stream.length) {
              try {
                linearization = Linearization.create(this.stream);
              } catch (err) {
                if (err instanceof MissingDataException) {
                  throw err;
                }
                info(err);
              }
            }
            // shadow the prototype getter with a data property
            return shadow(this, "linearization", linearization);
          },
          /**
           * The offset handed to `XRef.setStartXRef`; 0 when none can be
           * determined. Cached after the first evaluation.
           */
          get startXRef() {
            var stream = this.stream;
            var startXRef = 0;
            var linearization = this.linearization;
            if (linearization) {
              // Find end of first obj.
              stream.reset();
              if (find(stream, "endobj", 1024)) {
                startXRef = stream.pos + 6;
              }
            } else {
              // Find startxref by jumping backward from the end of the file.
              var step = 1024;
              var found = false,
                pos = stream.end;
              while (!found && pos > 0) {
                // Windows overlap by the keyword length so that a keyword
                // straddling a window boundary is not missed.
                pos -= step - "startxref".length;
                if (pos < 0) {
                  pos = 0;
                }
                stream.pos = pos;
                found = find(stream, "startxref", step, true);
              }
              if (found) {
                stream.skip(9);
                var ch;
                do {
                  ch = stream.getByte();
                } while (Lexer.isSpace(ch));
                var str = "";
                // Collect every byte from SPACE (0x20) up to '9' (0x39);
                // parseInt below extracts the leading number.
                while (ch >= 0x20 && ch <= 0x39) {
                  str += String.fromCharCode(ch);
                  ch = stream.getByte();
                }
                startXRef = parseInt(str, 10);
                if (isNaN(startXRef)) {
                  startXRef = 0;
                }
              }
            }
            // shadow the prototype getter with a data property
            return shadow(this, "startXRef", startXRef);
          },
          /**
           * `linearization.mainXRefEntriesOffset` when the document is
           * linearized, otherwise 0. Cached after the first evaluation.
           */
          get mainXRefEntriesOffset() {
            var mainXRefEntriesOffset = 0;
            var linearization = this.linearization;
            if (linearization) {
              mainXRefEntriesOffset = linearization.mainXRefEntriesOffset;
            }
            // shadow the prototype getter with a data property
            return shadow(this, "mainXRefEntriesOffset", mainXRefEntriesOffset);
          },
          // Find the header, remove leading garbage and setup the stream
          // starting from the header.
          checkHeader: function PDFDocument_checkHeader() {
            var stream = this.stream;
            stream.reset();
            if (find(stream, "%PDF-", 1024)) {
              // Found the header, trim off any garbage before it.
              stream.moveStart();
              // Reading file format version
              var MAX_VERSION_LENGTH = 12;
              var version = "",
                ch;
              // Read up to the first byte that is SPACE (0x20) or lower.
              while ((ch = stream.getByte()) > 0x20) {
                if (version.length >= MAX_VERSION_LENGTH) {
                  break;
                }
                version += String.fromCharCode(ch);
              }
              if (!this.pdfFormatVersion) {
                // removing "%PDF-"-prefix
                this.pdfFormatVersion = version.substring(5);
              }
              return;
            }
            // May not be a PDF file, continue anyway.
          },
          // Hands the computed `startXRef` offset to the xref table.
          parseStartXRef: function PDFDocument_parseStartXRef() {
            var startXRef = this.startXRef;
            this.xref.setStartXRef(startXRef);
          },
          /**
           * Parses the xref table and creates the catalog.
           *
           * @param recoveryMode - Passed on to `XRef.parse`.
           */
          setup: function PDFDocument_setup(recoveryMode) {
            this.xref.parse(recoveryMode);
            this.catalog = new Catalog(this.pdfManager, this.xref, false);
          },
          /**
           * The page count, taken from the linearization data when present and
           * from the catalog otherwise. Cached after the first evaluation.
           */
          get numPages() {
            var linearization = this.linearization;
            var num = linearization
              ? linearization.numPages
              : this.catalog.numPages;
            // shadow the prototype getter
            return shadow(this, "numPages", num);
          },
          /**
           * An object with `PDFFormatVersion`, `IsAcroFormPresent` and
           * `IsXFAPresent`, plus every validated entry of the trailer's /Info
           * dictionary. Cached after the first successful evaluation.
           *
           * An unreadable or non-dictionary /Info is ignored; a
           * `MissingDataException` raised while reading it is rethrown, so the
           * incomplete result is not cached.
           */
          get documentInfo() {
            var docInfo = {
              PDFFormatVersion: this.pdfFormatVersion,
              IsAcroFormPresent: !!this.acroForm,
              IsXFAPresent: !!this.xfa
            };
            var infoDict;
            try {
              infoDict = this.xref.trailer.get("Info");
            } catch (err) {
              // Same policy as the `linearization` getter: missing data must
              // reach the caller rather than be mistaken for a broken entry.
              if (err instanceof MissingDataException) {
                throw err;
              }
              info("The document information dictionary is invalid.");
            }
            // /Info may hold something other than a dictionary in a malformed
            // file; only a real dictionary can be queried below.
            if (isDict(infoDict)) {
              var validEntries = DocumentInfoValidators.entries;
              // Only fill the document info with valid entries from the spec.
              for (var key in validEntries) {
                if (infoDict.has(key)) {
                  var value = infoDict.get(key);
                  // Make sure the value conforms to the spec.
                  if (validEntries[key](value)) {
                    docInfo[key] =
                      typeof value !== "string" ? value : stringToPDFString(value);
                  } else {
                    info('Bad value in document info for "' + key + '"');
                  }
                }
              }
            }
            return shadow(this, "documentInfo", docInfo);
          },
          /**
           * A lowercase hex string built from the first /ID entry of the
           * trailer, or, when that entry is absent, not a string or all zeros,
           * from an MD5 over the first `FINGERPRINT_FIRST_BYTES` bytes of the
           * stream. Cached after the first evaluation.
           */
          get fingerprint() {
            var xref = this.xref,
              hash,
              fileID = "";
            var idArray = xref.trailer.get("ID");

            if (
              idArray &&
              isArray(idArray) &&
              idArray[0] &&
              isString(idArray[0]) &&
              idArray[0] !== EMPTY_FINGERPRINT
            ) {
              hash = stringToBytes(idArray[0]);
            } else {
              if (this.stream.ensureRange) {
                this.stream.ensureRange(
                  0,
                  Math.min(FINGERPRINT_FIRST_BYTES, this.stream.end)
                );
              }
              hash = calculateMD5(
                this.stream.bytes.subarray(0, FINGERPRINT_FIRST_BYTES),
                0,
                FINGERPRINT_FIRST_BYTES
              );
            }

            // Two hex digits per byte, zero-padded.
            for (var i = 0, n = hash.length; i < n; i++) {
              var hex = hash[i].toString(16);
              fileID += hex.length === 1 ? "0" + hex : hex;
            }

            return shadow(this, "fingerprint", fileID);
          },

          // Delegates to `Catalog.getPage`.
          getPage: function PDFDocument_getPage(pageIndex) {
            return this.catalog.getPage(pageIndex);
          },

          // Delegates to `Catalog.cleanup`.
          cleanup: function PDFDocument_cleanup() {
            return this.catalog.cleanup();
          }
        };

        return PDFDocument;
      })();
    
      exports.PDFDocument = PDFDocument;
    });