Improve performance when parsing big OGG files
by skipping pages in the middle (unless a full parse is forced). Additionally, the size of the tracks is now determined at container level, which makes handling the skipping easier.
This commit is contained in:
parent
7ab83a55ec
commit
568f876b34
|
@ -186,24 +186,27 @@ void OggContainer::removeAllTags()
|
||||||
void OggContainer::internalParseHeader()
|
void OggContainer::internalParseHeader()
|
||||||
{
|
{
|
||||||
static const string context("parsing OGG bitstream header");
|
static const string context("parsing OGG bitstream header");
|
||||||
|
bool pagesSkipped = false;
|
||||||
|
|
||||||
// iterate through pages using OggIterator helper class
|
// iterate through pages using OggIterator helper class
|
||||||
try {
|
try {
|
||||||
// ensure iterator is setup properly
|
// ensure iterator is setup properly
|
||||||
for(m_iterator.removeFilter(), m_iterator.reset(); m_iterator; m_iterator.nextPage()) {
|
for(m_iterator.removeFilter(), m_iterator.reset(); m_iterator; m_iterator.nextPage()) {
|
||||||
const OggPage &page = m_iterator.currentPage();
|
const OggPage &page = m_iterator.currentPage();
|
||||||
if(m_validateChecksums) {
|
if(m_validateChecksums && page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
|
||||||
if(page.checksum() != OggPage::computeChecksum(stream(), page.startOffset())) {
|
addNotification(NotificationType::Warning, argsToString("The denoted checksum of the OGG page at ", m_iterator.currentSegmentOffset(), " does not match the computed checksum."), context);
|
||||||
addNotification(NotificationType::Warning, "The denoted checksum of the OGG page at " % ConversionUtilities::numberToString(m_iterator.currentSegmentOffset()) + " does not match the computed checksum.", context);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
OggStream *stream;
|
OggStream *stream;
|
||||||
|
uint64 lastNewStreamOffset = 0;
|
||||||
try {
|
try {
|
||||||
stream = m_tracks[m_streamsBySerialNo.at(page.streamSerialNumber())].get();
|
stream = m_tracks[m_streamsBySerialNo.at(page.streamSerialNumber())].get();
|
||||||
|
stream->m_size += page.dataSize();
|
||||||
} catch(const out_of_range &) {
|
} catch(const out_of_range &) {
|
||||||
// new stream serial number recognized -> add new stream
|
// new stream serial number recognized -> add new stream
|
||||||
m_streamsBySerialNo[page.streamSerialNumber()] = m_tracks.size();
|
m_streamsBySerialNo[page.streamSerialNumber()] = m_tracks.size();
|
||||||
m_tracks.emplace_back(make_unique<OggStream>(*this, m_iterator.currentPageIndex()));
|
m_tracks.emplace_back(make_unique<OggStream>(*this, m_iterator.currentPageIndex()));
|
||||||
stream = m_tracks.back().get();
|
stream = m_tracks.back().get();
|
||||||
|
lastNewStreamOffset = page.startOffset();
|
||||||
}
|
}
|
||||||
if(stream->m_currentSequenceNumber != page.sequenceNumber()) {
|
if(stream->m_currentSequenceNumber != page.sequenceNumber()) {
|
||||||
if(stream->m_currentSequenceNumber) {
|
if(stream->m_currentSequenceNumber) {
|
||||||
|
@ -213,13 +216,39 @@ void OggContainer::internalParseHeader()
|
||||||
} else {
|
} else {
|
||||||
++stream->m_currentSequenceNumber;
|
++stream->m_currentSequenceNumber;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// skip pages in the middle of a big file (still more than 100 MiB to parse) if no new track has been seen since the last 20 MiB
|
||||||
|
if(!fileInfo().isForcingFullParse()
|
||||||
|
&& (fileInfo().size() - page.startOffset()) > (100 * 0x100000)
|
||||||
|
&& (page.startOffset() - lastNewStreamOffset) > (20 * 0x100000)) {
|
||||||
|
if(m_iterator.resyncAt(fileInfo().size() - (20 * 0x100000))) {
|
||||||
|
const OggPage &resyncedPage = m_iterator.currentPage();
|
||||||
|
// prevent warning about missing pages
|
||||||
|
stream->m_currentSequenceNumber = resyncedPage.sequenceNumber() + 1;
|
||||||
|
pagesSkipped = true;
|
||||||
|
addNotification(NotificationType::Information,
|
||||||
|
argsToString("Pages in the middle of the file (", dataSizeToString(resyncedPage.startOffset() - page.startOffset()) ,") have been skipped to improve parsing speed. Hence track sizes can not be computed. Maybe not even all tracks could be detected. Force a full parse to prevent this."),
|
||||||
|
context);
|
||||||
|
} else {
|
||||||
|
// abort if skipping pages didn't work
|
||||||
|
addNotification(NotificationType::Critical, "Unable to re-sync after skipping OGG pages in the middle of the file. Try forcing a full parse.", context);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch(const TruncatedDataException &) {
|
} catch(const TruncatedDataException &) {
|
||||||
// thrown when page exceeds max size
|
// thrown when page exceeds max size
|
||||||
addNotification(NotificationType::Critical, "The OGG file is truncated.", context);
|
addNotification(NotificationType::Critical, "The OGG file is truncated.", context);
|
||||||
} catch(const InvalidDataException &) {
|
} catch(const InvalidDataException &) {
|
||||||
// thrown when first 4 byte do not match capture pattern
|
// thrown when first 4 byte do not match capture pattern
|
||||||
addNotification(NotificationType::Critical, "Capture pattern \"OggS\" at " % numberToString(m_iterator.currentSegmentOffset()) + " expected.", context);
|
addNotification(NotificationType::Critical, argsToString("Capture pattern \"OggS\" at ", m_iterator.currentSegmentOffset(), " expected."), context);
|
||||||
|
}
|
||||||
|
|
||||||
|
// invalidate stream sizes in case pages have been skipped
|
||||||
|
if(pagesSkipped) {
|
||||||
|
for(auto &stream : m_tracks) {
|
||||||
|
stream->m_size = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -279,7 +308,7 @@ void OggContainer::internalParseTracks()
|
||||||
m_duration = stream->duration();
|
m_duration = stream->duration();
|
||||||
}
|
}
|
||||||
} catch(const Failure &) {
|
} catch(const Failure &) {
|
||||||
addNotification(NotificationType::Critical, "Unable to parse stream at " % numberToString(stream->startOffset()) + ".", context);
|
addNotification(NotificationType::Critical, argsToString("Unable to parse stream at ", stream->startOffset(), '.'), context);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,9 +2,13 @@
|
||||||
|
|
||||||
#include "../exceptions.h"
|
#include "../exceptions.h"
|
||||||
|
|
||||||
|
#include <c++utilities/io/binaryreader.h>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
using namespace IoUtilities;
|
||||||
|
|
||||||
namespace Media {
|
namespace Media {
|
||||||
|
|
||||||
|
@ -155,7 +159,9 @@ void OggIterator::read(char *buffer, size_t count)
|
||||||
* \remarks
|
* \remarks
|
||||||
* - Might increase the current page index and/or the current segment index.
|
* - Might increase the current page index and/or the current segment index.
|
||||||
* - Page headers are skipped (this is the whole purpose of this method).
|
* - Page headers are skipped (this is the whole purpose of this method).
|
||||||
* - Does not read more than \a max bytes from the buffer.
|
* - Does not write more than \a max bytes to the buffer.
|
||||||
|
* \returns Returns the number of bytes read from the OGG stream. This might be less than \a max in
|
||||||
|
* case not that many bytes were available.
|
||||||
* \sa read()
|
* \sa read()
|
||||||
* \sa currentCharacterOffset()
|
* \sa currentCharacterOffset()
|
||||||
* \sa seekForward()
|
* \sa seekForward()
|
||||||
|
@ -206,6 +212,68 @@ void OggIterator::ignore(size_t count)
|
||||||
throw TruncatedDataException();
|
throw TruncatedDataException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Fetches the next page at the specified \a offset.
|
||||||
|
*
|
||||||
|
* This allows to omit parts of a file which is useful to
|
||||||
|
* - find the last page faster by skipping pages in the middle (last page is required for calculating
|
||||||
|
* the files duration).
|
||||||
|
* - recover parsing after after an error occured.
|
||||||
|
*
|
||||||
|
* Regardless of the current iterator position, this method will assume the page at \a offset comes after
|
||||||
|
* the last known page. Hence \a offset must be greather than OggPage::startOffset() + OggPage::totalSize() of the
|
||||||
|
* last known page. This is checked by the method.
|
||||||
|
*
|
||||||
|
* If the OGG capture pattern is not present at \a offset, up to 65307 bytes (max. size of an OGG page) are
|
||||||
|
* skipped. So in a valid stream, this method will always succeed if \a offset is less than the stream size minus
|
||||||
|
* 65307.
|
||||||
|
*
|
||||||
|
* If a page could be found, it is appended to pages() and the iterator position is set to the first segment of
|
||||||
|
* that page. If no page could be found, this method does not alter the iterator.
|
||||||
|
*
|
||||||
|
* \returns Returns an indication whether a page could be found.
|
||||||
|
* \throws Throws std::ios_base::failure when an IO error occurs.
|
||||||
|
* \throws Throws Failure when a parsing error occurs.
|
||||||
|
*/
|
||||||
|
bool OggIterator::resyncAt(uint64 offset)
|
||||||
|
{
|
||||||
|
// check whether offset is valid
|
||||||
|
if(offset >= streamSize() || offset < (m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// find capture pattern 'OggS'
|
||||||
|
stream().seekg(offset);
|
||||||
|
byte lettersFound = 0;
|
||||||
|
for(uint64 bytesAvailable = max<uint64>(streamSize() - offset, 65307ul); bytesAvailable >= 27; --bytesAvailable) {
|
||||||
|
switch(static_cast<char>(stream().get())) {
|
||||||
|
case 'O':
|
||||||
|
lettersFound = 1;
|
||||||
|
break;
|
||||||
|
case 'g':
|
||||||
|
lettersFound = lettersFound == 1 || lettersFound == 2 ? lettersFound + 1 : 0;
|
||||||
|
break;
|
||||||
|
case 'S':
|
||||||
|
if(lettersFound == 3) {
|
||||||
|
// capture pattern found
|
||||||
|
const auto currentOffset = stream().tellg();
|
||||||
|
// -> try to parse an OGG page at this position
|
||||||
|
try {
|
||||||
|
m_pages.emplace_back(stream(), static_cast<uint64>(stream().tellg()) - 4, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
|
||||||
|
setPageIndex(m_pages.size() - 1);
|
||||||
|
return true;
|
||||||
|
} catch (const Failure &) {
|
||||||
|
stream().seekg(currentOffset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
FALLTHROUGH;
|
||||||
|
default:
|
||||||
|
lettersFound = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Fetches the next page.
|
* \brief Fetches the next page.
|
||||||
*
|
*
|
||||||
|
@ -221,7 +289,8 @@ bool OggIterator::fetchNextPage()
|
||||||
if(m_page == m_pages.size()) { // can only fetch the next page if the current page is the last page
|
if(m_page == m_pages.size()) { // can only fetch the next page if the current page is the last page
|
||||||
m_offset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize();
|
m_offset = m_pages.empty() ? m_startOffset : m_pages.back().startOffset() + m_pages.back().totalSize();
|
||||||
if(m_offset < m_streamSize) {
|
if(m_offset < m_streamSize) {
|
||||||
m_pages.emplace_back(*m_stream, m_offset, static_cast<int32>(m_streamSize - m_offset));
|
const uint64 bytesAvailable = m_streamSize - m_offset;
|
||||||
|
m_pages.emplace_back(*m_stream, m_offset, bytesAvailable > numeric_limits<int32>::max() ? numeric_limits<int32>::max() : static_cast<int32>(bytesAvailable));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ public:
|
||||||
void previousSegment();
|
void previousSegment();
|
||||||
const std::vector<OggPage> &pages() const;
|
const std::vector<OggPage> &pages() const;
|
||||||
const OggPage &currentPage() const;
|
const OggPage &currentPage() const;
|
||||||
|
uint64 currentPageOffset() const;
|
||||||
std::vector<OggPage>::size_type currentPageIndex() const;
|
std::vector<OggPage>::size_type currentPageIndex() const;
|
||||||
void setPageIndex(std::vector<OggPage>::size_type index);
|
void setPageIndex(std::vector<OggPage>::size_type index);
|
||||||
void setSegmentIndex(std::vector<uint32>::size_type index);
|
void setSegmentIndex(std::vector<uint32>::size_type index);
|
||||||
|
@ -40,6 +41,7 @@ public:
|
||||||
size_t readAll(char *buffer, std::size_t max);
|
size_t readAll(char *buffer, std::size_t max);
|
||||||
void ignore(std::size_t count = 1);
|
void ignore(std::size_t count = 1);
|
||||||
bool bytesRemaining(std::size_t atLeast) const;
|
bool bytesRemaining(std::size_t atLeast) const;
|
||||||
|
bool resyncAt(uint64 offset);
|
||||||
|
|
||||||
operator bool() const;
|
operator bool() const;
|
||||||
OggIterator &operator++();
|
OggIterator &operator++();
|
||||||
|
@ -131,6 +133,15 @@ inline const OggPage &OggIterator::currentPage() const
|
||||||
return m_pages[m_page];
|
return m_pages[m_page];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Returns the start offset of the current OGG page.
|
||||||
|
* \remarks Calling this method when the iterator is invalid causes undefined behaviour.
|
||||||
|
*/
|
||||||
|
inline uint64 OggIterator::currentPageOffset() const
|
||||||
|
{
|
||||||
|
return m_pages[m_page].startOffset();
|
||||||
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Returns an indication whether the iterator is valid.
|
* \brief Returns an indication whether the iterator is valid.
|
||||||
*
|
*
|
||||||
|
@ -139,7 +150,7 @@ inline const OggPage &OggIterator::currentPage() const
|
||||||
*
|
*
|
||||||
* If the iterator is invalid, it can be reseted using the reset() method.
|
* If the iterator is invalid, it can be reseted using the reset() method.
|
||||||
*
|
*
|
||||||
* Some methods might cause undefined behaviour if called on an invalid iterator.
|
* Some methods cause undefined behaviour if called on an invalid iterator.
|
||||||
*/
|
*/
|
||||||
inline OggIterator::operator bool() const
|
inline OggIterator::operator bool() const
|
||||||
{
|
{
|
||||||
|
@ -156,8 +167,7 @@ inline std::vector<OggPage>::size_type OggIterator::currentPageIndex() const
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Sets the current page index.
|
* \brief Sets the current page index.
|
||||||
*
|
* \remarks This method should never be called with an \a index out of range (which is defined by the number of fetched pages), since this would cause undefined behaviour.
|
||||||
* This method should never be called with an \a index out of range (which is the defined by the number of fetched pages), since this causes undefined behaviour.
|
|
||||||
*/
|
*/
|
||||||
inline void OggIterator::setPageIndex(std::vector<OggPage>::size_type index)
|
inline void OggIterator::setPageIndex(std::vector<OggPage>::size_type index)
|
||||||
{
|
{
|
||||||
|
@ -250,6 +260,10 @@ inline void OggIterator::removeFilter()
|
||||||
* This means that for each page in the stream in the specified range (stream and range have been specified when
|
* This means that for each page in the stream in the specified range (stream and range have been specified when
|
||||||
* constructing the iterator) an OggPage instance has been created and pushed to pages(). This is independend from
|
* constructing the iterator) an OggPage instance has been created and pushed to pages(). This is independend from
|
||||||
* the current iterator position. Fetched pages remain after resetting the iterator.
|
* the current iterator position. Fetched pages remain after resetting the iterator.
|
||||||
|
*
|
||||||
|
* \remarks This is also true if pages in the middle of the file have been omitted because it is actually just checked
|
||||||
|
* whether the last page has been fetched.
|
||||||
|
* \todo Rename to isLastPageFetched() in next major release.
|
||||||
*/
|
*/
|
||||||
inline bool OggIterator::areAllPagesFetched() const
|
inline bool OggIterator::areAllPagesFetched() const
|
||||||
{
|
{
|
||||||
|
|
|
@ -35,6 +35,7 @@ public:
|
||||||
byte segmentTableSize() const;
|
byte segmentTableSize() const;
|
||||||
const std::vector<uint32> &segmentSizes() const;
|
const std::vector<uint32> &segmentSizes() const;
|
||||||
uint32 headerSize() const;
|
uint32 headerSize() const;
|
||||||
|
uint32 dataSize() const;
|
||||||
uint32 totalSize() const;
|
uint32 totalSize() const;
|
||||||
uint64 dataOffset(byte segmentIndex = 0) const;
|
uint64 dataOffset(byte segmentIndex = 0) const;
|
||||||
static uint32 makeSegmentSizeDenotation(std::ostream &stream, uint32 size);
|
static uint32 makeSegmentSizeDenotation(std::ostream &stream, uint32 size);
|
||||||
|
@ -221,12 +222,20 @@ inline uint32 OggPage::headerSize() const
|
||||||
return 27 + m_segmentCount;
|
return 27 + m_segmentCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Returns the data size in byte.
|
||||||
|
*/
|
||||||
|
inline uint32 OggPage::dataSize() const
|
||||||
|
{
|
||||||
|
return std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0u);
|
||||||
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Returns the total size of the page in byte.
|
* \brief Returns the total size of the page in byte.
|
||||||
*/
|
*/
|
||||||
inline uint32 OggPage::totalSize() const
|
inline uint32 OggPage::totalSize() const
|
||||||
{
|
{
|
||||||
return headerSize() + std::accumulate(m_segmentSizes.cbegin(), m_segmentSizes.cend(), 0);
|
return headerSize() + dataSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
|
|
@ -62,11 +62,8 @@ void OggStream::internalParseHeader()
|
||||||
const auto pred = bind(&OggPage::matchesStreamSerialNumber, _1, firstPage.streamSerialNumber());
|
const auto pred = bind(&OggPage::matchesStreamSerialNumber, _1, firstPage.streamSerialNumber());
|
||||||
|
|
||||||
// iterate through segments using OggIterator
|
// iterate through segments using OggIterator
|
||||||
// -> iterate through ALL segments to calculate the precise stream size (hence the out-commented part in the loop-condition)
|
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator && (!hasIdentificationHeader || !hasCommentHeader); ++iterator) {
|
||||||
for(bool hasIdentificationHeader = false, hasCommentHeader = false; iterator /* && (!hasIdentificationHeader && !hasCommentHeader) */; ++iterator) {
|
|
||||||
const uint32 currentSize = iterator.currentSegmentSize();
|
const uint32 currentSize = iterator.currentSegmentSize();
|
||||||
m_size += currentSize;
|
|
||||||
|
|
||||||
if(currentSize >= 8) {
|
if(currentSize >= 8) {
|
||||||
// determine stream format
|
// determine stream format
|
||||||
inputStream().seekg(iterator.currentSegmentOffset());
|
inputStream().seekg(iterator.currentSegmentOffset());
|
||||||
|
|
Loading…
Reference in New Issue