Improve performance when parsing big OGG files
by skipping pages in the middle (unless a full parse is forced). Additionally, the size of the tracks is now determined on container-level which makes handling the skipping easier.
This commit is contained in:
parent
7ab83a55ec
commit
568f876b34
|
@ -186,24 +186,27 @@ void OggContainer::removeAllTags()
|
|||
void OggContainer::internalParseHeader()
|
||||
{
|
||||
static const string context("parsing OGG bitstream header");
|
||||
bool pagesSkipped = false;
|
||||
|
||||
// iterate through pages using OggIterator helper class
|
||||
try {
|
||||
// ensure iterator is setup properly
|
||||
for(m_iterator.removeFilter(), m_iterator.reset(); m_iterator; m_iterator.nextPage()) {
|
||||
const OggPage &page = m_iterator.currentPage();
|
||||
if(m_validateChecksums) {
|
||||
if(page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
|
||||
addNotification(NotificationType::Warning, "The denoted checksum of the OGG page at " % ConversionUtilities::numberToString(m_iterator.currentSegmentOffset()) + " does not match the computed checksum.", context);
|
||||
}
|
||||
if(m_validateChecksums && page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
|
||||
addNotification(NotificationType::Warning, argsToString("The denoted checksum of the OGG page at ", m_iterator.currentSegmentOffset(), " does not match the computed checksum."), context);
|
||||
}
|
||||
OggStream *stream;
|
||||
uint64 lastNewStreamOffset = 0;
|
||||
try {
|
||||
stream = m_tracks[m_streamsBySerialNo.at(page.streamSerialNumber())].get();
|
||||
stream->m_size += page.dataSize();
|
||||
} catch(const out_of_range &) {
|
||||
// new stream serial number recognized -> add new stream
|
||||
m_streamsBySerialNo[page.streamSerialNumber()] = m_tracks.size();
|
||||
m_tracks.emplace_back(make_unique<OggStream>(*this, m_iterator.currentPageIndex()));
|
||||
stream = m_tracks.back().get();
|
||||
lastNewStreamOffset = page.startOffset();
|
||||
}
|
||||
if(stream->m_currentSequenceNumber != page.sequenceNumber()) {
|
||||
if(stream->m_currentSequenceNumber) {
|
||||
|
@ -213,13 +216,39 @@ void OggContainer::internalParseHeader()
|
|||
} else {
|
||||
++stream->m_currentSequenceNumber;
|
||||
}
|
||||
|
||||
// skip pages in the middle of a big file (still more than 100 MiB to parse) if no new track has been seen since the last 20 MiB
|
||||
if(!fileInfo().isForcingFullParse()
|
||||
&& (fileInfo().size() - page.startOffset()) > (100 * 0x100000)
|
||||
&& (page.startOffset() - lastNewStreamOffset) > (20 * 0x100000)) {
|
||||
if(m_iterator.resyncAt(fileInfo().size() - (20 * 0x100000))) {
|
||||
const OggPage &resyncedPage = m_iterator.currentPage();
|
||||
// prevent warning about missing pages
|
||||
stream->m_currentSequenceNumber = resyncedPage.sequenceNumber() + 1;
|
||||
pagesSkipped = true;
|
||||
addNotification(NotificationType::Information,
|
||||
argsToString("Pages in the middle of the file (", dataSizeToString(resyncedPage.startOffset() - page.startOffset()) ,") have been skipped to improve parsing speed. Hence track sizes can not be computed. Maybe not even all tracks could be detected. Force a full parse to prevent this."),
|
||||
context);
|
||||
} else {
|
||||
// abort if skipping pages didn't work
|
||||
addNotification(NotificationType::Critical, "Unable to re-sync after skipping OGG pages in the middle of the file. Try forcing a full parse.", context);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(const TruncatedDataException &) {
|
||||
// thrown when page exceeds max size
|
||||
addNotification(NotificationType::Critical, "The OGG file is truncated.", context);
|
||||
} catch(const InvalidDataException &) {
|
||||
// thrown when first 4 byte do not match capture pattern
|
||||
addNotification(NotificationType::Critical, "Capture pattern \"OggS\" at " % numberToString(m_iterator.currentSegmentOffset()) + " expected.", context);
|
||||
addNotification(NotificationType::Critical, argsToString("Capture pattern \"OggS\" at ", m_iterator.currentSegmentOffset(), " expected."), context);
|
||||
}
|
||||
|
||||
// invalidate stream sizes in case pages have been skipped
|
||||
if(pagesSkipped) {
|
||||
for(auto &stream : m_tracks) {
|
||||
stream->m_size = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -279,7 +308,7 @@ void OggContainer::internalParseTracks()
|
|||
m_duration = stream->duration();
|
||||
}
|
||||
} catch(const Failure &) {
|
||||
addNotification(NotificationType::Critical, "Unable to parse stream at " % numberToString(stream->startOffset()) + ".", context);
|
||||
addNotification(NotificationType::Critical, argsToString("Unable to parse stream at ", stream->startOffset(), '.'), context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,9 +2,13 @@
|
|||
|
||||
#include "../exceptions.h"
|
||||
|
||||
#include <c++utilities/io/binaryreader.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
|
||||
using namespace std;
|
||||
using namespace IoUtilities;
|
||||
|
||||
namespace Media {
|
||||
|
||||
|
@ -155,7 +159,9 @@ void OggIterator::read(char *buffer, size_t count)
|
|||
* \remarks
|
||||
* - Might increase the current page index and/or the current segment index.
|
||||
* - Page headers are skipped (this is the whole purpose of this method).
|
||||
* - Does not read more than \a max bytes from the buffer.
|
||||
* - Does not write more than \a max bytes to the buffer.
|
||||
* \returns Returns the number of bytes read from the OGG stream. This might be less than \a max in
|
||||
* case not that many bytes were available.
|
||||
* \sa read()
|
||||
* \sa currentCharacterOffset()
|
||||
* \sa seekForward()
|
||||
|
@ -206,6 +212,68 @@ void OggIterator::ignore(size_t count)
|
|||
throw TruncatedDataException();
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Fetches the next page at the specified \a offset.
|
||||
*
|
||||
* This allows to omit parts of a file which is useful to
|
||||
* - find the last page faster by skipping pages in the middle (last page is required for calculating
|
||||
* the files duration).
|
||||
* - recover parsing after after an error occured.
|
||||
*
|
||||
* Regardless of the current iterator position, this method will assume the page at \a offset comes after
|
||||
* the last known page. Hence \a offset must be greather than OggPage::startOffset() + OggPage::totalSize() of the
|
||||
* last known page. This is checked by the method.
|
||||
*
|
||||
* If the OGG capture pattern is not present at \a offset, up to 65307 bytes (max. size of an OGG page) are
|
||||
* skipped. So in a valid stream, this method will always succeed if \a offset is less than the stream size minus
|
||||
* 65307.
|
||||
*
|
||||
* If a page could be found, it is appended to pages() and the iterator position is set to the first segment of
|
||||
* that page. If no page could be found, this method does not alter the iterator.
|
||||
*
|
||||
* \returns Returns an indication whether a page could be found.
|
||||
* \throws Throws std::ios_base::failure when an IO error occurs.
|
||||
* \throws Throws Failure when a parsing error occurs.
|
||||
*/
|
||||
bool OggIterator::resyncAt(uint64 offset)
|
||||
{
|
||||
// check whether offset is valid
|
||||
if(offset >= streamSize() || offset < (m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// find capture pattern 'OggS'
|
||||
stream().seekg(offset);
|
||||
byte lettersFound = 0;
|
||||
for(uint64 bytesAvailable = max<uint64>(streamSize() - offset, 65307ul); bytesAvailable >= 27; --bytesAvailable) {
|
||||
switch(static_cast<char>(stream().get())) {
|
||||
case 'O':
|
||||
lettersFound = 1;
|
||||
break;
|
||||
case 'g':
|
||||
lettersFound = lettersFound == 1 || lettersFound == 2 ? lettersFound + 1 : 0;
|
||||
break;
|
||||
case 'S':
|
||||
if(lettersFound == 3) {
|
||||
// capture pattern found
|
||||
const auto currentOffset = stream().tellg();
|
||||
// -> try to parse an OGG page at this position
|
||||
try {
|
||||
m_pages.emplace_back(stream(), static_cast<uint64>(stream().tellg()) - 4, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
|
||||
setPageIndex(m_pages.size() - 1);
|
||||
return true;
|
||||
} catch (const Failure &) {
|
||||
stream().seekg(currentOffset);
|
||||
}
|
||||
}
|
||||
FALLTHROUGH;
|
||||
default:
|
||||
lettersFound = 0;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Fetches the next page.
|
||||
*
|
||||
|
@ -221,7 +289,8 @@ bool OggIterator::fetchNextPage()
|
|||
if(m_page == m_pages.size()) { // can only fetch the next page if the current page is the last page
|
||||
m_offset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize();
|
||||
if(m_offset < m_streamSize) {
|
||||
m_pages.emplace_back(*m_stream, m_offset, static_cast<int32>(m_streamSize - m_offset));
|
||||
const uint64 bytesAvailable = m_streamSize - m_offset;
|
||||
m_pages.emplace_back(*m_stream, m_offset, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ public:
|
|||
void previousSegment();
|
||||
const std::vector<OggPage> &pages() const;
|
||||
const OggPage &currentPage() const;
|
||||
uint64 currentPageOffset() const;
|
||||
std::vector<OggPage>::size_type currentPageIndex() const;
|
||||
void setPageIndex(std::vector<OggPage>::size_type index);
|
||||
void setSegmentIndex(std::vector<uint32>::size_type index);
|
||||
|
@ -40,6 +41,7 @@ public:
|
|||
size_t readAll(char *buffer, std::size_t max);
|
||||
void ignore(std::size_t count = 1);
|
||||
bool bytesRemaining(std::size_t atLeast) const;
|
||||
bool resyncAt(uint64 offset);
|
||||
|
||||
operator bool() const;
|
||||
OggIterator &operator++();
|
||||
|
@ -131,6 +133,15 @@ inline const OggPage &OggIterator::currentPage() const
|
|||
return m_pages[m_page];
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Returns the start offset of the current OGG page.
|
||||
* \remarks Calling this method when the iterator is invalid causes undefined behaviour.
|
||||
*/
|
||||
inline uint64 OggIterator::currentPageOffset() const
|
||||
{
|
||||
return m_pages[m_page].startOffset();
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Returns an indication whether the iterator is valid.
|
||||
*
|
||||
|
@ -139,7 +150,7 @@ inline const OggPage &OggIterator::currentPage() const
|
|||
*
|
||||
* If the iterator is invalid, it can be reset using the reset() method.
|
||||
*
|
||||
* Some methods might cause undefined behaviour if called on an invalid iterator.
|
||||
* Some methods cause undefined behaviour if called on an invalid iterator.
|
||||
*/
|
||||
inline OggIterator::operator bool() const
|
||||
{
|
||||
|
@ -156,8 +167,7 @@ inline std::vector<OggPage>::size_type OggIterator::currentPageIndex() const
|
|||
|
||||
/*!
|
||||
* \brief Sets the current page index.
|
||||
*
|
||||
* This method should never be called with an \a index out of range (which is the defined by the number of fetched pages), since this causes undefined behaviour.
|
||||
* \remarks This method should never be called with an \a index out of range (which is defined by the number of fetched pages), since this would cause undefined behaviour.
|
||||
*/
|
||||
inline void OggIterator::setPageIndex(std::vector<OggPage>::size_type index)
|
||||
{
|
||||
|
@ -250,6 +260,10 @@ inline void OggIterator::removeFilter()
|
|||
* This means that for each page in the stream in the specified range (stream and range have been specified when
|
||||
* constructing the iterator) an OggPage instance has been created and pushed to pages(). This is independent of
|
||||
* the current iterator position. Fetched pages remain after resetting the iterator.
|
||||
*
|
||||
* \remarks This is also true if pages in the middle of the file have been omitted because it is actually just checked
|
||||
* whether the last page has been fetched.
|
||||
* \todo Rename to isLastPageFetched() in next major release.
|
||||
*/
|
||||
inline bool OggIterator::areAllPagesFetched() const
|
||||
{
|
||||
|
|
|
@ -35,6 +35,7 @@ public:
|
|||
byte segmentTableSize() const;
|
||||
const std::vector<uint32> &segmentSizes() const;
|
||||
uint32 headerSize() const;
|
||||
uint32 dataSize() const;
|
||||
uint32 totalSize() const;
|
||||
uint64 dataOffset(byte segmentIndex = 0) const;
|
||||
static uint32 makeSegmentSizeDenotation(std::ostream &stream, uint32 size);
|
||||
|
@ -221,12 +222,20 @@ inline uint32 OggPage::headerSize() const
|
|||
return 27 + m_segmentCount;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Returns the data size in byte.
|
||||
*/
|
||||
inline uint32 OggPage::dataSize() const
|
||||
{
|
||||
return std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0u);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Returns the total size of the page in byte.
|
||||
*/
|
||||
inline uint32 OggPage::totalSize() const
|
||||
{
|
||||
return headerSize() + std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0);
|
||||
return headerSize() + dataSize();
|
||||
}
|
||||
|
||||
/*!
|
||||
|
|
|
@ -62,11 +62,8 @@ void OggStream::internalParseHeader()
|
|||
const auto pred = bind(&OggPage::matchesStreamSerialNumber, _1, firstPage.streamSerialNumber());
|
||||
|
||||
// iterate through segments using OggIterator
|
||||
// -> iterate through ALL segments to calculate the precise stream size (hence the out-commented part in the loop-condition)
|
||||
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator /* && (!hasIdentificationHeader && !hasCommentHeader) */; ++iterator) {
|
||||
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator && (!hasIdentificationHeader || !hasCommentHeader); ++iterator) {
|
||||
const uint32 currentSize = iterator.currentSegmentSize();
|
||||
m_size += currentSize;
|
||||
|
||||
if(currentSize >= 8) {
|
||||
// determine stream format
|
||||
inputStream().seekg(iterator.currentSegmentOffset());
|
||||
|
|
Loading…
Reference in New Issue