Improve performance when parsing big OGG files

by skipping pages in the middle (unless a full parse
is forced).

Additionally, the size of the tracks is now determined
at container level, which makes handling the skipping
easier.
This commit is contained in:
Martchus 2017-08-29 01:29:27 +02:00
parent 7ab83a55ec
commit 568f876b34
5 changed files with 134 additions and 16 deletions

View File

@ -186,24 +186,27 @@ void OggContainer::removeAllTags()
void OggContainer::internalParseHeader()
{
static const string context("parsing OGG bitstream header");
bool pagesSkipped = false;
// iterate through pages using OggIterator helper class
try {
// ensure iterator is setup properly
for(m_iterator.removeFilter(), m_iterator.reset(); m_iterator; m_iterator.nextPage()) {
const OggPage &page = m_iterator.currentPage();
if(m_validateChecksums) {
if(page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
addNotification(NotificationType::Warning, "The denoted checksum of the OGG page at " % ConversionUtilities::numberToString(m_iterator.currentSegmentOffset()) + " does not match the computed checksum.", context);
}
if(m_validateChecksums && page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
addNotification(NotificationType::Warning, argsToString("The denoted checksum of the OGG page at ", m_iterator.currentSegmentOffset(), " does not match the computed checksum."), context);
}
OggStream *stream;
uint64 lastNewStreamOffset = 0;
try {
stream = m_tracks[m_streamsBySerialNo.at(page.streamSerialNumber())].get();
stream->m_size += page.dataSize();
} catch(const out_of_range &) {
// new stream serial number recognized -> add new stream
m_streamsBySerialNo[page.streamSerialNumber()] = m_tracks.size();
m_tracks.emplace_back(make_unique<OggStream>(*this, m_iterator.currentPageIndex()));
stream = m_tracks.back().get();
lastNewStreamOffset = page.startOffset();
}
if(stream->m_currentSequenceNumber != page.sequenceNumber()) {
if(stream->m_currentSequenceNumber) {
@ -213,13 +216,39 @@ void OggContainer::internalParseHeader()
} else {
++stream->m_currentSequenceNumber;
}
// skip pages in the middle of a big file (still more than 100 MiB to parse) if no new track has been seen since the last 20 MiB
if(!fileInfo().isForcingFullParse()
&& (fileInfo().size() - page.startOffset()) > (100 * 0x100000)
&& (page.startOffset() - lastNewStreamOffset) > (20 * 0x100000)) {
if(m_iterator.resyncAt(fileInfo().size() - (20 * 0x100000))) {
const OggPage &resyncedPage = m_iterator.currentPage();
// prevent warning about missing pages
stream->m_currentSequenceNumber = resyncedPage.sequenceNumber() + 1;
pagesSkipped = true;
addNotification(NotificationType::Information,
argsToString("Pages in the middle of the file (", dataSizeToString(resyncedPage.startOffset() - page.startOffset()) ,") have been skipped to improve parsing speed. Hence track sizes can not be computed. Maybe not even all tracks could be detected. Force a full parse to prevent this."),
context);
} else {
// abort if skipping pages didn't work
addNotification(NotificationType::Critical, "Unable to re-sync after skipping OGG pages in the middle of the file. Try forcing a full parse.", context);
return;
}
}
}
} catch(const TruncatedDataException &) {
// thrown when page exceeds max size
addNotification(NotificationType::Critical, "The OGG file is truncated.", context);
} catch(const InvalidDataException &) {
// thrown when first 4 byte do not match capture pattern
addNotification(NotificationType::Critical, "Capture pattern \"OggS\" at " % numberToString(m_iterator.currentSegmentOffset()) + " expected.", context);
addNotification(NotificationType::Critical, argsToString("Capture pattern \"OggS\" at ", m_iterator.currentSegmentOffset(), " expected."), context);
}
// invalidate stream sizes in case pages have been skipped
if(pagesSkipped) {
for(auto &stream : m_tracks) {
stream->m_size = 0;
}
}
}
@ -279,7 +308,7 @@ void OggContainer::internalParseTracks()
m_duration = stream->duration();
}
} catch(const Failure &) {
addNotification(NotificationType::Critical, "Unable to parse stream at " % numberToString(stream->startOffset()) + ".", context);
addNotification(NotificationType::Critical, argsToString("Unable to parse stream at ", stream->startOffset(), '.'), context);
}
}
}

View File

@ -2,9 +2,13 @@
#include "../exceptions.h"
#include <c++utilities/io/binaryreader.h>
#include <iostream>
#include <limits>
using namespace std;
using namespace IoUtilities;
namespace Media {
@ -155,7 +159,9 @@ void OggIterator::read(char *buffer, size_t count)
* \remarks
* - Might increase the current page index and/or the current segment index.
* - Page headers are skipped (this is the whole purpose of this method).
* - Does not read more than \a max bytes from the buffer.
* - Does not write more than \a max bytes to the buffer.
* \returns Returns the number of bytes read from the OGG stream. This might be less than \a max in
* case not that many bytes were available.
* \sa read()
* \sa currentCharacterOffset()
* \sa seekForward()
@ -206,6 +212,68 @@ void OggIterator::ignore(size_t count)
throw TruncatedDataException();
}
/*!
 * \brief Fetches the next page at the specified \a offset.
 *
 * This allows to omit parts of a file which is useful to
 * - find the last page faster by skipping pages in the middle (last page is required for calculating
 *   the file's duration).
 * - recover parsing after an error occurred.
 *
 * Regardless of the current iterator position, this method will assume the page at \a offset comes after
 * the last known page. Hence \a offset must not be less than OggPage::startOffset() + OggPage::totalSize() of the
 * last known page (or the start offset if no page has been fetched yet) and must be less than the stream size.
 * This is checked by the method; false is returned if the check fails.
 *
 * If the OGG capture pattern is not present at \a offset, the following bytes are searched for it. The search
 * never reads beyond the end of the stream and stops when less than 27 bytes (size of an OGG page header
 * without segment table) are left. Since an OGG page is at most 65307 bytes long, a valid stream is expected
 * to contain a capture pattern within that distance.
 *
 * A capture pattern for which no page can be parsed is treated as a false positive; the search continues
 * right after it.
 *
 * If a page could be found, it is appended to pages() and the iterator position is set to the first segment of
 * that page. If no page could be found, neither pages() nor the iterator position are altered (the read position
 * of the underlying stream is altered in any case).
 *
 * \returns Returns an indication whether a page could be found.
 * \throws Might throw std::ios_base::failure when an IO error occurs (presumably depends on the exception mask
 *         of the underlying stream).
 */
bool OggIterator::resyncAt(uint64 offset)
{
    // check whether offset is valid: it must be within the stream and behind the last known page
    const uint64 minOffset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize();
    if(offset >= streamSize() || offset < minOffset) {
        return false;
    }

    // find capture pattern 'OggS'
    stream().seekg(offset);
    byte lettersFound = 0;
    // note: bytesAvailable is the number of bytes between the byte which is about to be read and the end of the
    //       stream; it must not exceed what the stream actually holds to avoid reading beyond its end
    for(uint64 bytesAvailable = streamSize() - offset; bytesAvailable >= 27; --bytesAvailable) {
        switch(static_cast<char>(stream().get())) {
        case 'O':
            lettersFound = 1;
            break;
        case 'g':
            lettersFound = lettersFound == 1 || lettersFound == 2 ? lettersFound + 1 : 0;
            break;
        case 'S':
            if(lettersFound == 3) {
                // capture pattern found
                const auto currentOffset = stream().tellg();
                // -> the page starts at the 'O' which is 4 bytes before the current read position
                const uint64 pageOffset = static_cast<uint64>(currentOffset) - 4;
                // -> the page can not be bigger than the rest of the stream (counted from the start of the page)
                const uint64 maxPageSize = streamSize() - pageOffset;
                // -> try to parse an OGG page at this position
                try {
                    m_pages.emplace_back(stream(), pageOffset, maxPageSize > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(maxPageSize));
                    setPageIndex(m_pages.size() - 1);
                    return true;
                } catch (const Failure &) {
                    // false positive -> continue the search right after the 'S'
                    stream().seekg(currentOffset);
                }
            }
            FALLTHROUGH;
        default:
            lettersFound = 0;
        }
    }
    return false;
}
/*!
* \brief Fetches the next page.
*
@ -221,7 +289,8 @@ bool OggIterator::fetchNextPage()
if(m_page == m_pages.size()) { // can only fetch the next page if the current page is the last page
m_offset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize();
if(m_offset < m_streamSize) {
m_pages.emplace_back(*m_stream, m_offset, static_cast<int32>(m_streamSize - m_offset));
const uint64 bytesAvailable = m_streamSize - m_offset;
m_pages.emplace_back(*m_stream, m_offset, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
return true;
}
}

View File

@ -25,6 +25,7 @@ public:
void previousSegment();
const std::vector<OggPage> &pages() const;
const OggPage &currentPage() const;
uint64 currentPageOffset() const;
std::vector<OggPage>::size_type currentPageIndex() const;
void setPageIndex(std::vector<OggPage>::size_type index);
void setSegmentIndex(std::vector<uint32>::size_type index);
@ -40,6 +41,7 @@ public:
size_t readAll(char *buffer, std::size_t max);
void ignore(std::size_t count = 1);
bool bytesRemaining(std::size_t atLeast) const;
bool resyncAt(uint64 offset);
operator bool() const;
OggIterator &operator++();
@ -131,6 +133,15 @@ inline const OggPage &OggIterator::currentPage() const
return m_pages[m_page];
}
/*!
 * \brief Returns the offset at which the current OGG page starts.
 * \remarks The iterator must be valid when calling this method; otherwise the behaviour is undefined.
 */
inline uint64 OggIterator::currentPageOffset() const
{
    const auto &page = m_pages[m_page];
    return page.startOffset();
}
/*!
* \brief Returns an indication whether the iterator is valid.
*
@ -139,7 +150,7 @@ inline const OggPage &OggIterator::currentPage() const
*
* If the iterator is invalid, it can be reset using the reset() method.
*
* Some methods might cause undefined behaviour if called on an invalid iterator.
* Some methods cause undefined behaviour if called on an invalid iterator.
*/
inline OggIterator::operator bool() const
{
@ -156,8 +167,7 @@ inline std::vector<OggPage>::size_type OggIterator::currentPageIndex() const
/*!
* \brief Sets the current page index.
*
* This method should never be called with an \a index out of range (which is the defined by the number of fetched pages), since this causes undefined behaviour.
* \remarks This method should never be called with an \a index out of range (which is defined by the number of fetched pages), since this would cause undefined behaviour.
*/
inline void OggIterator::setPageIndex(std::vector<OggPage>::size_type index)
{
@ -250,6 +260,10 @@ inline void OggIterator::removeFilter()
* This means that for each page in the stream in the specified range (stream and range have been specified when
* constructing the iterator) an OggPage instance has been created and pushed to pages(). This is independent from
* the current iterator position. Fetched pages remain after resetting the iterator.
*
* \remarks This is also true if pages in the middle of the file have been omitted because it is actually just checked
* whether the last page has been fetched.
* \todo Rename to isLastPageFetched() in next major release.
*/
inline bool OggIterator::areAllPagesFetched() const
{

View File

@ -35,6 +35,7 @@ public:
byte segmentTableSize() const;
const std::vector<uint32> &segmentSizes() const;
uint32 headerSize() const;
uint32 dataSize() const;
uint32 totalSize() const;
uint64 dataOffset(byte segmentIndex = 0) const;
static uint32 makeSegmentSizeDenotation(std::ostream &stream, uint32 size);
@ -221,12 +222,20 @@ inline uint32 OggPage::headerSize() const
return 27 + m_segmentCount;
}
/*!
 * \brief Returns the data size in byte.
 * \remarks The value is computed by adding up the sizes of all segments of the page.
 */
inline uint32 OggPage::dataSize() const
{
    uint32 sum = 0u;
    for(const auto segmentSize : m_segmentSizes) {
        sum += segmentSize;
    }
    return sum;
}
/*!
 * \brief Returns the total size of the page in byte.
 * \remarks This is the size of the header (see headerSize()) plus the size of the data (see dataSize()).
 */
inline uint32 OggPage::totalSize() const
{
    // the summation of the segment sizes lives in dataSize() so it is not duplicated here
    return headerSize() + dataSize();
}
/*!

View File

@ -62,11 +62,8 @@ void OggStream::internalParseHeader()
const auto pred = bind(&OggPage::matchesStreamSerialNumber, _1, firstPage.streamSerialNumber());
// iterate through segments using OggIterator
// -> iterate through ALL segments to calculate the precise stream size (hence the out-commented part in the loop-condition)
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator /* && (!hasIdentificationHeader && !hasCommentHeader) */; ++iterator) {
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator && (!hasIdentificationHeader || !hasCommentHeader); ++iterator) {
const uint32 currentSize = iterator.currentSegmentSize();
m_size += currentSize;
if(currentSize >= 8) {
// determine stream format
inputStream().seekg(iterator.currentSegmentOffset());