From 568f876b344a19f369d2a40a9108d139c0e433f1 Mon Sep 17 00:00:00 2001 From: Martchus Date: Tue, 29 Aug 2017 01:29:27 +0200 Subject: [PATCH] Improve performance when parsing big OGG files by skipping pages in the middle (unless a full parse is forced). Additionally, the size of the tracks is now determined on container-level which makes handling the skipping easier. --- ogg/oggcontainer.cpp | 41 +++++++++++++++++++++---- ogg/oggiterator.cpp | 73 ++++++++++++++++++++++++++++++++++++++++++-- ogg/oggiterator.h | 20 ++++++++++-- ogg/oggpage.h | 11 ++++++- ogg/oggstream.cpp | 5 +-- 5 files changed, 134 insertions(+), 16 deletions(-) diff --git a/ogg/oggcontainer.cpp b/ogg/oggcontainer.cpp index 23081ec..b52ee9c 100644 --- a/ogg/oggcontainer.cpp +++ b/ogg/oggcontainer.cpp @@ -186,24 +186,27 @@ void OggContainer::removeAllTags() void OggContainer::internalParseHeader() { static const string context("parsing OGG bitstream header"); + bool pagesSkipped = false; + // iterate through pages using OggIterator helper class try { // ensure iterator is setup properly for(m_iterator.removeFilter(), m_iterator.reset(); m_iterator; m_iterator.nextPage()) { const OggPage &page = m_iterator.currentPage(); - if(m_validateChecksums) { - if(page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) { - addNotification(NotificationType::Warning, "The denoted checksum of the OGG page at " % ConversionUtilities::numberToString(m_iterator.currentSegmentOffset()) + " does not match the computed checksum.", context); - } + if(m_validateChecksums && page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) { + addNotification(NotificationType::Warning, argsToString("The denoted checksum of the OGG page at ", m_iterator.currentSegmentOffset(), " does not match the computed checksum."), context); } OggStream *stream; + uint64 lastNewStreamOffset = 0; try { stream = m_tracks[m_streamsBySerialNo.at(page.streamSerialNumber())].get(); + stream->m_size += 
page.dataSize(); } catch(const out_of_range &) { // new stream serial number recognized -> add new stream m_streamsBySerialNo[page.streamSerialNumber()] = m_tracks.size(); m_tracks.emplace_back(make_unique(*this, m_iterator.currentPageIndex())); stream = m_tracks.back().get(); + lastNewStreamOffset = page.startOffset(); } if(stream->m_currentSequenceNumber != page.sequenceNumber()) { if(stream->m_currentSequenceNumber) { @@ -213,13 +216,39 @@ void OggContainer::internalParseHeader() } else { ++stream->m_currentSequenceNumber; } + + // skip pages in the middle of a big file (still more than 100 MiB to parse) if no new track has been seen since the last 20 MiB + if(!fileInfo().isForcingFullParse() + && (fileInfo().size() - page.startOffset()) > (100 * 0x100000) + && (page.startOffset() - lastNewStreamOffset) > (20 * 0x100000)) { + if(m_iterator.resyncAt(fileInfo().size() - (20 * 0x100000))) { + const OggPage &resyncedPage = m_iterator.currentPage(); + // prevent warning about missing pages + stream->m_currentSequenceNumber = resyncedPage.sequenceNumber() + 1; + pagesSkipped = true; + addNotification(NotificationType::Information, + argsToString("Pages in the middle of the file (", dataSizeToString(resyncedPage.startOffset() - page.startOffset()) ,") have been skipped to improve parsing speed. Hence track sizes can not be computed. Maybe not even all tracks could be detected. Force a full parse to prevent this."), + context); + } else { + // abort if skipping pages didn't work + addNotification(NotificationType::Critical, "Unable to re-sync after skipping OGG pages in the middle of the file. 
Try forcing a full parse.", context); + return; + } + } } } catch(const TruncatedDataException &) { // thrown when page exceeds max size addNotification(NotificationType::Critical, "The OGG file is truncated.", context); } catch(const InvalidDataException &) { // thrown when first 4 byte do not match capture pattern - addNotification(NotificationType::Critical, "Capture pattern \"OggS\" at " % numberToString(m_iterator.currentSegmentOffset()) + " expected.", context); + addNotification(NotificationType::Critical, argsToString("Capture pattern \"OggS\" at ", m_iterator.currentSegmentOffset(), " expected."), context); + } + + // invalidate stream sizes in case pages have been skipped + if(pagesSkipped) { + for(auto &stream : m_tracks) { + stream->m_size = 0; + } } } @@ -279,7 +308,7 @@ void OggContainer::internalParseTracks() m_duration = stream->duration(); } } catch(const Failure &) { - addNotification(NotificationType::Critical, "Unable to parse stream at " % numberToString(stream->startOffset()) + ".", context); + addNotification(NotificationType::Critical, argsToString("Unable to parse stream at ", stream->startOffset(), '.'), context); } } } diff --git a/ogg/oggiterator.cpp b/ogg/oggiterator.cpp index 1bf16f6..24d628f 100644 --- a/ogg/oggiterator.cpp +++ b/ogg/oggiterator.cpp @@ -2,9 +2,13 @@ #include "../exceptions.h" +#include + #include +#include using namespace std; +using namespace IoUtilities; namespace Media { @@ -155,7 +159,9 @@ void OggIterator::read(char *buffer, size_t count) * \remarks * - Might increase the current page index and/or the current segment index. * - Page headers are skipped (this is the whole purpose of this method). - * - Does not read more than \a max bytes from the buffer. + * - Does not write more than \a max bytes to the buffer. + * \returns Returns the number of bytes read from the OGG stream. This might be less than \a max in + * case not that many bytes were available. 
* \sa read() * \sa currentCharacterOffset() * \sa seekForward() @@ -206,6 +212,68 @@ void OggIterator::ignore(size_t count) throw TruncatedDataException(); } +/*! + * \brief Fetches the next page at the specified \a offset. + * + * This allows omitting parts of a file, which is useful to + * - find the last page faster by skipping pages in the middle (last page is required for calculating + * the file's duration). + * - recover parsing after an error occurred. + * + * Regardless of the current iterator position, this method will assume the page at \a offset comes after + * the last known page. Hence \a offset must be greater than OggPage::startOffset() + OggPage::totalSize() of the + * last known page. This is checked by the method. + * + * If the OGG capture pattern is not present at \a offset, up to 65307 bytes (max. size of an OGG page) are + * skipped. So in a valid stream, this method will always succeed if \a offset is less than the stream size minus + * 65307. + * + * If a page could be found, it is appended to pages() and the iterator position is set to the first segment of + * that page. If no page could be found, this method does not alter the iterator. + * + * \returns Returns an indication whether a page could be found. + * \throws Throws std::ios_base::failure when an IO error occurs. + * \throws Throws Failure when a parsing error occurs. + */ +bool OggIterator::resyncAt(uint64 offset) +{ + // check whether offset is valid + if(offset >= streamSize() || offset < (m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize())) { + return false; + } + + // find capture pattern 'OggS' + stream().seekg(offset); + byte lettersFound = 0; + for(uint64 bytesAvailable = max(streamSize() - offset, 65307ul); bytesAvailable >= 27; --bytesAvailable) { + switch(static_cast(stream().get())) { + case 'O': + lettersFound = 1; + break; + case 'g': + lettersFound = lettersFound == 1 || lettersFound == 2 ?
lettersFound + 1 : 0; + break; + case 'S': + if(lettersFound == 3) { + // capture pattern found + const auto currentOffset = stream().tellg(); + // -> try to parse an OGG page at this position + try { + m_pages.emplace_back(stream(), static_cast(stream().tellg()) - 4, bytesAvailable > numeric_limits::max() ? numeric_limits::max() : static_cast(bytesAvailable)); + setPageIndex(m_pages.size() - 1); + return true; + } catch (const Failure &) { + stream().seekg(currentOffset); + } + } + FALLTHROUGH; + default: + lettersFound = 0; + } + } + return false; +} + /*! * \brief Fetches the next page. * @@ -221,7 +289,8 @@ bool OggIterator::fetchNextPage() if(m_page == m_pages.size()) { // can only fetch the next page if the current page is the last page m_offset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize(); if(m_offset < m_streamSize) { - m_pages.emplace_back(*m_stream, m_offset, static_cast(m_streamSize - m_offset)); + const uint64 bytesAvailable = m_streamSize - m_offset; + m_pages.emplace_back(*m_stream, m_offset, bytesAvailable > numeric_limits::max() ? numeric_limits::max() : static_cast(bytesAvailable)); return true; } } diff --git a/ogg/oggiterator.h b/ogg/oggiterator.h index 475ea9d..479dc64 100644 --- a/ogg/oggiterator.h +++ b/ogg/oggiterator.h @@ -25,6 +25,7 @@ public: void previousSegment(); const std::vector &pages() const; const OggPage ¤tPage() const; + uint64 currentPageOffset() const; std::vector::size_type currentPageIndex() const; void setPageIndex(std::vector::size_type index); void setSegmentIndex(std::vector::size_type index); @@ -40,6 +41,7 @@ public: size_t readAll(char *buffer, std::size_t max); void ignore(std::size_t count = 1); bool bytesRemaining(std::size_t atLeast) const; + bool resyncAt(uint64 offset); operator bool() const; OggIterator &operator++(); @@ -131,6 +133,15 @@ inline const OggPage &OggIterator::currentPage() const return m_pages[m_page]; } +/*! 
+ * \brief Returns the start offset of the current OGG page. + * \remarks Calling this method when the iterator is invalid causes undefined behaviour. + */ +inline uint64 OggIterator::currentPageOffset() const +{ + return m_pages[m_page].startOffset(); +} + /*! * \brief Returns an indication whether the iterator is valid. * @@ -139,7 +150,7 @@ inline const OggPage &OggIterator::currentPage() const * * If the iterator is invalid, it can be reseted using the reset() method. * - * Some methods might cause undefined behaviour if called on an invalid iterator. + * Some methods cause undefined behaviour if called on an invalid iterator. */ inline OggIterator::operator bool() const { @@ -156,8 +167,7 @@ inline std::vector::size_type OggIterator::currentPageIndex() const /*! * \brief Sets the current page index. - * - * This method should never be called with an \a index out of range (which is the defined by the number of fetched pages), since this causes undefined behaviour. + * \remarks This method should never be called with an \a index out of range (which is defined by the number of fetched pages), since this would cause undefined behaviour. */ inline void OggIterator::setPageIndex(std::vector::size_type index) { @@ -250,6 +260,10 @@ inline void OggIterator::removeFilter() * This means that for each page in the stream in the specified range (stream and range have been specified when * constructing the iterator) an OggPage instance has been created and pushed to pages(). This is independend from * the current iterator position. Fetched pages remain after resetting the iterator. + * + * \remarks This is also true if pages in the middle of the file have been omitted because it is actually just checked + * whether the last page has been fetched. + * \todo Rename to isLastPageFetched() in next major release. 
*/ inline bool OggIterator::areAllPagesFetched() const { diff --git a/ogg/oggpage.h b/ogg/oggpage.h index a07c769..2ad18a8 100644 --- a/ogg/oggpage.h +++ b/ogg/oggpage.h @@ -35,6 +35,7 @@ public: byte segmentTableSize() const; const std::vector &segmentSizes() const; uint32 headerSize() const; + uint32 dataSize() const; uint32 totalSize() const; uint64 dataOffset(byte segmentIndex = 0) const; static uint32 makeSegmentSizeDenotation(std::ostream &stream, uint32 size); @@ -221,12 +222,20 @@ inline uint32 OggPage::headerSize() const return 27 + m_segmentCount; } +/*! + * \brief Returns the data size in byte. + */ +inline uint32 OggPage::dataSize() const +{ + return std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0u); +} + /*! * \brief Returns the total size of the page in byte. */ inline uint32 OggPage::totalSize() const { - return headerSize() + std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0); + return headerSize() + dataSize(); } /*! diff --git a/ogg/oggstream.cpp b/ogg/oggstream.cpp index 52547f8..4beb685 100644 --- a/ogg/oggstream.cpp +++ b/ogg/oggstream.cpp @@ -62,11 +62,8 @@ void OggStream::internalParseHeader() const auto pred = bind(&OggPage::matchesStreamSerialNumber, _1, firstPage.streamSerialNumber()); // iterate through segments using OggIterator - // -> iterate through ALL segments to calculate the precise stream size (hence the out-commented part in the loop-condition) - for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator /* && (!hasIdentificationHeader && !hasCommentHeader) */; ++iterator) { + for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator && (!hasIdentificationHeader || !hasCommentHeader); ++iterator) { const uint32 currentSize = iterator.currentSegmentSize(); - m_size += currentSize; - if(currentSize >= 8) { // determine stream format inputStream().seekg(iterator.currentSegmentOffset());