Improve performance when parsing big OGG files

by skipping pages in the middle (unless a full parse
is forced).

Additionally, the size of the tracks is now determined
on container level, which makes handling the skipping easier.
This commit is contained in:
Martchus 2017-08-29 01:29:27 +02:00
parent 7ab83a55ec
commit 568f876b34
5 changed files with 134 additions and 16 deletions

View File

@ -186,24 +186,27 @@ void OggContainer::removeAllTags()
void OggContainer::internalParseHeader()
static const string context("parsing OGG bitstream header");
bool pagesSkipped = false;
// iterate through pages using OggIterator helper class
try {
// ensure iterator is setup properly
for(m_iterator.removeFilter(), m_iterator.reset(); m_iterator; m_iterator.nextPage()) {
const OggPage &page = m_iterator.currentPage();
if(m_validateChecksums) {
if(page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
addNotification(NotificationType::Warning, "The denoted checksum of the OGG page at " % ConversionUtilities::numberToString(m_iterator.currentSegmentOffset()) + " does not match the computed checksum.", context);
if(m_validateChecksums && page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
addNotification(NotificationType::Warning, argsToString("The denoted checksum of the OGG page at ", m_iterator.currentSegmentOffset(), " does not match the computed checksum."), context);
OggStream *stream;
uint64 lastNewStreamOffset = 0;
try {
stream = m_tracks[].get();
stream->m_size += page.dataSize();
} catch(const out_of_range &) {
// new stream serial number recognized -> add new stream
m_streamsBySerialNo[page.streamSerialNumber()] = m_tracks.size();
m_tracks.emplace_back(make_unique<OggStream>(*this, m_iterator.currentPageIndex()));
stream = m_tracks.back().get();
lastNewStreamOffset = page.startOffset();
if(stream->m_currentSequenceNumber != page.sequenceNumber()) {
if(stream->m_currentSequenceNumber) {
@ -213,13 +216,39 @@ void OggContainer::internalParseHeader()
} else {
// skip pages in the middle of a big file (still more than 100 MiB to parse) if no new track has been seen since the last 20 MiB
&& (fileInfo().size() - page.startOffset()) > (100 * 0x100000)
&& (page.startOffset() - lastNewStreamOffset) > (20 * 0x100000)) {
if(m_iterator.resyncAt(fileInfo().size() - (20 * 0x100000))) {
const OggPage &resyncedPage = m_iterator.currentPage();
// prevent warning about missing pages
stream->m_currentSequenceNumber = resyncedPage.sequenceNumber() + 1;
pagesSkipped = true;
argsToString("Pages in the middle of the file (", dataSizeToString(resyncedPage.startOffset() - page.startOffset()) ,") have been skipped to improve parsing speed. Hence track sizes can not be computed. Maybe not even all tracks could be detected. Force a full parse to prevent this."),
} else {
// abort if skipping pages didn't work
addNotification(NotificationType::Critical, "Unable to re-sync after skipping OGG pages in the middle of the file. Try forcing a full parse.", context);
} catch(const TruncatedDataException &) {
// thrown when page exceeds max size
addNotification(NotificationType::Critical, "The OGG file is truncated.", context);
} catch(const InvalidDataException &) {
// thrown when the first 4 bytes do not match the capture pattern
addNotification(NotificationType::Critical, "Capture pattern \"OggS\" at " % numberToString(m_iterator.currentSegmentOffset()) + " expected.", context);
addNotification(NotificationType::Critical, argsToString("Capture pattern \"OggS\" at ", m_iterator.currentSegmentOffset(), " expected."), context);
// invalidate stream sizes in case pages have been skipped
if(pagesSkipped) {
for(auto &stream : m_tracks) {
stream->m_size = 0;
@ -279,7 +308,7 @@ void OggContainer::internalParseTracks()
m_duration = stream->duration();
} catch(const Failure &) {
addNotification(NotificationType::Critical, "Unable to parse stream at " % numberToString(stream->startOffset()) + ".", context);
addNotification(NotificationType::Critical, argsToString("Unable to parse stream at ", stream->startOffset(), '.'), context);

View File

@ -2,9 +2,13 @@
#include "../exceptions.h"
#include <c++utilities/io/binaryreader.h>
#include <iostream>
#include <limits>
using namespace std;
using namespace IoUtilities;
namespace Media {
@ -155,7 +159,9 @@ void OggIterator::read(char *buffer, size_t count)
* \remarks
* - Might increase the current page index and/or the current segment index.
* - Page headers are skipped (this is the whole purpose of this method).
* - Does not read more than \a max bytes from the buffer.
* - Does not write more than \a max bytes to the buffer.
* \returns Returns the number of bytes read from the OGG stream. This might be less than \a max in
* case not that many bytes were available.
* \sa read()
* \sa currentCharacterOffset()
* \sa seekForward()
@ -206,6 +212,68 @@ void OggIterator::ignore(size_t count)
throw TruncatedDataException();
* \brief Fetches the next page at the specified \a offset.
* This allows to omit parts of a file which is useful to
* - find the last page faster by skipping pages in the middle (last page is required for calculating
* the file's duration).
* - recover parsing after an error occurred.
* Regardless of the current iterator position, this method will assume the page at \a offset comes after
* the last known page. Hence \a offset must be greater than OggPage::startOffset() + OggPage::totalSize() of the
* last known page. This is checked by the method.
* If the OGG capture pattern is not present at \a offset, up to 65307 bytes (max. size of an OGG page) are
* skipped. So in a valid stream, this method will always succeed if \a offset is less than the stream size minus
* 65307.
* If a page could be found, it is appended to pages() and the iterator position is set to the first segment of
* that page. If no page could be found, this method does not alter the iterator.
* \returns Returns an indication whether a page could be found.
* \throws Throws std::ios_base::failure when an IO error occurs.
* \throws Throws Failure when a parsing error occurs.
bool OggIterator::resyncAt(uint64 offset)
// check whether offset is valid
if(offset >= streamSize() || offset < (m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize())) {
return false;
// find capture pattern 'OggS'
byte lettersFound = 0;
for(uint64 bytesAvailable = max<uint64>(streamSize() - offset, 65307ul); bytesAvailable >= 27; --bytesAvailable) {
switch(static_cast<char>(stream().get())) {
case 'O':
lettersFound = 1;
case 'g':
lettersFound = lettersFound == 1 || lettersFound == 2 ? lettersFound + 1 : 0;
case 'S':
if(lettersFound == 3) {
// capture pattern found
const auto currentOffset = stream().tellg();
// -> try to parse an OGG page at this position
try {
m_pages.emplace_back(stream(), static_cast<uint64>(stream().tellg()) - 4, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
setPageIndex(m_pages.size() - 1);
return true;
} catch (const Failure &) {
lettersFound = 0;
return false;
* \brief Fetches the next page.
@ -221,7 +289,8 @@ bool OggIterator::fetchNextPage()
if(m_page == m_pages.size()) { // can only fetch the next page if the current page is the last page
m_offset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize();
if(m_offset < m_streamSize) {
m_pages.emplace_back(*m_stream, m_offset, static_cast<int32>(m_streamSize - m_offset));
const uint64 bytesAvailable = m_streamSize - m_offset;
m_pages.emplace_back(*m_stream, m_offset, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
return true;

View File

@ -25,6 +25,7 @@ public:
void previousSegment();
const std::vector<OggPage> &pages() const;
const OggPage &currentPage() const;
uint64 currentPageOffset() const;
std::vector<OggPage>::size_type currentPageIndex() const;
void setPageIndex(std::vector<OggPage>::size_type index);
void setSegmentIndex(std::vector<uint32>::size_type index);
@ -40,6 +41,7 @@ public:
size_t readAll(char *buffer, std::size_t max);
void ignore(std::size_t count = 1);
bool bytesRemaining(std::size_t atLeast) const;
bool resyncAt(uint64 offset);
operator bool() const;
OggIterator &operator++();
@ -131,6 +133,15 @@ inline const OggPage &OggIterator::currentPage() const
return m_pages[m_page];
* \brief Returns the start offset of the current OGG page.
* \remarks Calling this method when the iterator is invalid causes undefined behaviour.
inline uint64 OggIterator::currentPageOffset() const
return m_pages[m_page].startOffset();
* \brief Returns an indication whether the iterator is valid.
@ -139,7 +150,7 @@ inline const OggPage &OggIterator::currentPage() const
* If the iterator is invalid, it can be reset using the reset() method.
* Some methods might cause undefined behaviour if called on an invalid iterator.
* Some methods cause undefined behaviour if called on an invalid iterator.
inline OggIterator::operator bool() const
@ -156,8 +167,7 @@ inline std::vector<OggPage>::size_type OggIterator::currentPageIndex() const
* \brief Sets the current page index.
* This method should never be called with an \a index out of range (which is the defined by the number of fetched pages), since this causes undefined behaviour.
* \remarks This method should never be called with an \a index out of range (which is defined by the number of fetched pages), since this would cause undefined behaviour.
inline void OggIterator::setPageIndex(std::vector<OggPage>::size_type index)
@ -250,6 +260,10 @@ inline void OggIterator::removeFilter()
* This means that for each page in the stream in the specified range (stream and range have been specified when
* constructing the iterator) an OggPage instance has been created and pushed to pages(). This is independent of
* the current iterator position. Fetched pages remain after resetting the iterator.
* \remarks This is also true if pages in the middle of the file have been omitted because it is actually just checked
* whether the last page has been fetched.
* \todo Rename to isLastPageFetched() in next major release.
inline bool OggIterator::areAllPagesFetched() const

View File

@ -35,6 +35,7 @@ public:
byte segmentTableSize() const;
const std::vector<uint32> &segmentSizes() const;
uint32 headerSize() const;
uint32 dataSize() const;
uint32 totalSize() const;
uint64 dataOffset(byte segmentIndex = 0) const;
static uint32 makeSegmentSizeDenotation(std::ostream &stream, uint32 size);
@ -221,12 +222,20 @@ inline uint32 OggPage::headerSize() const
return 27 + m_segmentCount;
* \brief Returns the data size in bytes.
inline uint32 OggPage::dataSize() const
return std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0u);
* \brief Returns the total size of the page in bytes.
inline uint32 OggPage::totalSize() const
return headerSize() + std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0);
return headerSize() + dataSize();

View File

@ -62,11 +62,8 @@ void OggStream::internalParseHeader()
const auto pred = bind(&OggPage::matchesStreamSerialNumber, _1, firstPage.streamSerialNumber());
// iterate through segments using OggIterator
// -> iterate through ALL segments to calculate the precise stream size (hence the out-commented part in the loop-condition)
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator /* && (!hasIdentificationHeader && !hasCommentHeader) */; ++iterator) {
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator && (!hasIdentificationHeader || !hasCommentHeader); ++iterator) {
const uint32 currentSize = iterator.currentSegmentSize();
m_size += currentSize;
if(currentSize >= 8) {
// determine stream format