diff --git a/mapillary_tools/geotag/blackvue_parser.py b/mapillary_tools/geotag/blackvue_parser.py index a34d5327..99fc92ba 100644 --- a/mapillary_tools/geotag/blackvue_parser.py +++ b/mapillary_tools/geotag/blackvue_parser.py @@ -7,7 +7,7 @@ import pynmea2 from .. import geo -from . import simple_mp4_parser +from ..mp4 import simple_mp4_parser as sparser LOG = logging.getLogger(__name__) @@ -55,8 +55,8 @@ def _parse_gps_box(gps_data: bytes) -> T.Generator[geo.Point, None, None]: def extract_camera_model(fp: T.BinaryIO) -> str: try: - cprt_bytes = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"cprt"]) - except simple_mp4_parser.ParsingError: + cprt_bytes = sparser.parse_mp4_data_first(fp, [b"free", b"cprt"]) + except sparser.ParsingError: return "" if cprt_bytes is None: @@ -91,7 +91,7 @@ def extract_camera_model(fp: T.BinaryIO) -> str: def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]: - gps_data = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"gps "]) + gps_data = sparser.parse_mp4_data_first(fp, [b"free", b"gps "]) if gps_data is None: return None diff --git a/mapillary_tools/geotag/camm_builder.py b/mapillary_tools/geotag/camm_builder.py index 5ff61e35..5a013f57 100644 --- a/mapillary_tools/geotag/camm_builder.py +++ b/mapillary_tools/geotag/camm_builder.py @@ -2,11 +2,13 @@ import typing as T from .. import geo, types +from ..mp4 import ( + construct_mp4_parser as cparser, + mp4_sample_parser as sample_parser, +) from . import ( camm_parser, - construct_mp4_parser as cparser, - mp4_sample_parser as sample_parser, simple_mp4_builder as builder, ) from .simple_mp4_builder import BoxDict diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py index 994769d4..f93b7ffd 100644 --- a/mapillary_tools/geotag/camm_parser.py +++ b/mapillary_tools/geotag/camm_parser.py @@ -9,12 +9,8 @@ import construct as C -from . import ( - construct_mp4_parser as cparser, - geo, - mp4_sample_parser as sample_parser, - simple_mp4_parser as parser, -) +from . import geo +from ..mp4 import simple_mp4_parser as sparser, mp4_sample_parser as sample_parser LOG = logging.getLogger(__name__) @@ -82,12 +78,12 @@ class CAMMType(Enum): def _parse_point_from_sample( fp: T.BinaryIO, sample: sample_parser.Sample ) -> T.Optional[geo.Point]: - fp.seek(sample.offset, io.SEEK_SET) - data = fp.read(sample.size) + fp.seek(sample.raw_sample.offset, io.SEEK_SET) + data = fp.read(sample.raw_sample.size) box = CAMMSampleData.parse(data) if box.type == CAMMType.MIN_GPS.value: return geo.Point( - time=sample.time_offset, + time=sample.exact_time, lat=box.data[0], lon=box.data[1], alt=box.data[2], @@ -97,7 +93,7 @@ def _parse_point_from_sample( # Not using box.data.time_gps_epoch as the point timestamp # because it is from another clock return geo.Point( - time=sample.time_offset, + time=sample.exact_time, lat=box.data.latitude, lon=box.data.longitude, alt=box.data.altitude, @@ -148,15 +144,8 @@ def elst_entry_to_seconds( return (media_time, duration) -def _extract_camm_samples( - s: T.BinaryIO, - maxsize: int = -1, -) -> T.Generator[sample_parser.Sample, None, None]: - samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize) - camm_samples = ( - sample for sample in samples if sample.description["format"] == b"camm" - ) - yield from camm_samples +def _is_camm_description(description: T.Dict) -> bool: + return description["format"] == b"camm" def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]: @@ -166,59 +155,37 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]: """ points = None - movie_timescale = None - media_timescale = None - elst_entries = None - for h, s in parser.parse_path(fp, [b"moov", [b"mvhd", b"trak"]]): - if h.type == b"trak": - trak_start_offset = s.tell() - - descriptions = sample_parser.parse_descriptions_from_trak( - s, maxsize=h.maxsize + moov = sample_parser.MovieBoxParser.parse_stream(fp) + for track in moov.extract_tracks(): + descriptions = track.extract_sample_descriptions() + if any(_is_camm_description(d) for d in descriptions): + maybe_points = ( + _parse_point_from_sample(fp, sample) + for sample in track.extract_samples() + if _is_camm_description(sample.description) ) - camm_descriptions = [d for d in descriptions if d["format"] == b"camm"] - if camm_descriptions: - s.seek(trak_start_offset, io.SEEK_SET) - camm_samples = _extract_camm_samples(s, h.maxsize) - - points_with_nones = ( - _parse_point_from_sample(fp, sample) - for sample in camm_samples - if sample.description["format"] == b"camm" - ) - - points = [p for p in points_with_nones if p is not None] - if points: - s.seek(trak_start_offset) - elst_data = parser.parse_box_data_first( - s, [b"edts", b"elst"], maxsize=h.maxsize - ) - if elst_data is not None: - elst_entries = cparser.EditBox.parse(elst_data)["entries"] - - s.seek(trak_start_offset) - mdhd_data = parser.parse_box_data_firstx( - s, [b"mdia", b"mdhd"], maxsize=h.maxsize - ) - mdhd = cparser.MediaHeaderBox.parse(mdhd_data) - media_timescale = mdhd["timescale"] - else: - assert h.type == b"mvhd" - if not movie_timescale: - mvhd = cparser.MovieHeaderBox.parse(s.read(h.maxsize)) - movie_timescale = mvhd["timescale"] - - # exit when both found - if movie_timescale is not None and points: - break - - if points and movie_timescale and media_timescale and elst_entries: - segments = [ - elst_entry_to_seconds(entry, movie_timescale, media_timescale) - for entry in elst_entries - ] - points = list(filter_points_by_elst(points, segments)) + points = [p for p in maybe_points if p is not None] + if points: + elst_boxdata = track.extract_elst_boxdata() + if elst_boxdata is not None: + elst_entries = elst_boxdata["entries"] + if elst_entries: + # media_timescale + mdhd_boxdata = track.extract_mdhd_boxdata() + media_timescale = mdhd_boxdata["timescale"] + # movie_timescale + mvhd_boxdata = moov.extract_mvhd_boxdata() + movie_timescale = mvhd_boxdata["timescale"] + segments = [ + elst_entry_to_seconds( + entry, + movie_timescale=movie_timescale, + media_timescale=media_timescale, + ) + for entry in elst_entries + ] + points = list(filter_points_by_elst(points, segments)) return points @@ -238,7 +205,7 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.Point]: ) -def _decode_quietly(data: bytes, h: parser.Header) -> str: +def _decode_quietly(data: bytes, h: sparser.Header) -> str: try: return data.decode("utf-8") except UnicodeDecodeError: @@ -246,7 +213,7 @@ def _decode_quietly(data: bytes, h: parser.Header) -> str: return "" -def _parse_quietly(data: bytes, h: parser.Header) -> bytes: +def _parse_quietly(data: bytes, h: sparser.Header) -> bytes: try: parsed = MakeOrModel.parse(data) except C.ConstructError: @@ -256,7 +223,7 @@ def _parse_quietly(data: bytes, h: parser.Header) -> bytes: def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]: - header_and_stream = parser.parse_path( + header_and_stream = sparser.parse_path( fp, [ b"moov", @@ -296,7 +263,7 @@ def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]: # quit when both found if make and model: break - except parser.ParsingError: + except sparser.ParsingError: pass if make: diff --git a/mapillary_tools/geotag/geotag_videos_from_video.py b/mapillary_tools/geotag/geotag_videos_from_video.py index 77be8c6f..d1d31c0d 100644 --- a/mapillary_tools/geotag/geotag_videos_from_video.py +++ b/mapillary_tools/geotag/geotag_videos_from_video.py @@ -12,9 +12,9 @@ camm_parser, gpmf_gps_filter, gpmf_parser, - simple_mp4_parser as parser, utils as video_utils, ) +from ..mp4 import simple_mp4_parser as sparser from .geotag_from_generic import GeotagVideosFromGeneric LOG = logging.getLogger(__name__) @@ -77,7 +77,7 @@ def _extract_video_metadata( with video_path.open("rb") as fp: try: points = camm_parser.extract_points(fp) - except parser.ParsingError: + except sparser.ParsingError: points = None if points is not None: @@ -100,7 +100,7 @@ def _extract_video_metadata( with video_path.open("rb") as fp: try: points_with_fix = gpmf_parser.extract_points(fp) - except parser.ParsingError: + except sparser.ParsingError: points_with_fix = None if points_with_fix is not None: @@ -123,7 +123,7 @@ def _extract_video_metadata( with video_path.open("rb") as fp: try: points = blackvue_parser.extract_points(fp) - except parser.ParsingError: + except sparser.ParsingError: points = None if points is not None: diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py index c5b4945b..7feaf713 100644 --- a/mapillary_tools/geotag/gpmf_parser.py +++ b/mapillary_tools/geotag/gpmf_parser.py @@ -5,7 +5,7 @@ import construct as C from .. import geo -from . import mp4_sample_parser as sample_parser, simple_mp4_parser as parser +from ..mp4 import mp4_sample_parser as sample_parser """ Parsing GPS from GPMF data format stored in GoPros. See the GPMF spec: https://github.com/gopro/gpmf-parser @@ -304,8 +304,8 @@ def _extract_dvnm_from_samples( dvnm_by_dvid: T.Dict[int, bytes] = {} for sample in samples: - fp.seek(sample.offset, io.SEEK_SET) - data = fp.read(sample.size) + fp.seek(sample.raw_sample.offset, io.SEEK_SET) + data = fp.read(sample.raw_sample.size) gpmf_sample_data = T.cast(T.Dict, GPMFSampleData.parse(data)) # iterate devices @@ -328,8 +328,8 @@ def _extract_points_from_samples( points_by_dvid: T.Dict[int, T.List[geo.PointWithFix]] = {} for sample in samples: - fp.seek(sample.offset, io.SEEK_SET) - data = fp.read(sample.size) + fp.seek(sample.raw_sample.offset, io.SEEK_SET) + data = fp.read(sample.raw_sample.size) gpmf_sample_data = T.cast(T.Dict, GPMFSampleData.parse(data)) # iterate devices @@ -338,9 +338,9 @@ def _extract_points_from_samples( sample_points = _find_first_gps_stream(device["data"]) if sample_points: # interpolate timestamps in between - avg_timedelta = sample.timedelta / len(sample_points) + avg_timedelta = sample.exact_timedelta / len(sample_points) for idx, point in enumerate(sample_points): - point.time = sample.time_offset + avg_timedelta * idx + point.time = sample.exact_time + avg_timedelta * idx device_id = _find_first_device_id(device["data"]) device_points = points_by_dvid.setdefault(device_id, []) @@ -350,18 +350,25 @@ def _extract_points_from_samples( return values[0] if values else [] +def _is_gpmd_description(description: T.Dict) -> bool: + return description["format"] == b"gpmd" + + def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]: """ Return a list of points (could be empty) if it is a valid GoPro video, otherwise None """ points = None - for h, s in parser.parse_path(fp, [b"moov", b"trak"]): - trak_start_offset = s.tell() - descriptions = _extract_gpmd_descriptions_from_trak(s, h.maxsize) - if descriptions: - s.seek(trak_start_offset, io.SEEK_SET) - gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize) + moov = sample_parser.MovieBoxParser.parse_stream(fp) + for track in moov.extract_tracks(): + descriptions = track.extract_sample_descriptions() + if any(_is_gpmd_description(d) for d in descriptions): + gpmd_samples = ( + sample + for sample in track.extract_samples() + if _is_gpmd_description(sample.description) + ) points = list(_extract_points_from_samples(fp, gpmd_samples)) # return the firstly found non-empty points if points: @@ -370,35 +377,19 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]: return points -def _extract_gpmd_descriptions_from_trak( - s: T.BinaryIO, - maxsize: int = -1, -): - descriptions = sample_parser.parse_descriptions_from_trak(s, maxsize=maxsize) - return [d for d in descriptions if d["format"] == b"gpmd"] - - -def _extract_gpmd_samples_from_trak( - s: T.BinaryIO, - maxsize: int = -1, -) -> T.Generator[sample_parser.Sample, None, None]: - trak_start_offset = s.tell() - gpmd_descriptions = _extract_gpmd_descriptions_from_trak(s, maxsize=maxsize) - if gpmd_descriptions: - s.seek(trak_start_offset, io.SEEK_SET) - samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize) - gpmd_samples = ( - sample for sample in samples if sample.description["format"] == b"gpmd" - ) - yield from gpmd_samples - - def extract_all_device_names(fp: T.BinaryIO) -> T.Dict[int, bytes]: - for h, s in parser.parse_path(fp, [b"moov", b"trak"]): - gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize) - device_names = _extract_dvnm_from_samples(fp, gpmd_samples) - if device_names: - return device_names + moov = sample_parser.MovieBoxParser.parse_stream(fp) + for track in moov.extract_tracks(): + descriptions = track.extract_sample_descriptions() + if any(_is_gpmd_description(d) for d in descriptions): + gpmd_samples = ( + sample + for sample in track.extract_samples() + if _is_gpmd_description(sample.description) + ) + device_names = _extract_dvnm_from_samples(fp, gpmd_samples) + if device_names: + return device_names return {} @@ -439,12 +430,3 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.PointWithFix]: if points is None: return [] return points - - -def iterate_gpmd_sample_data(fp: T.BinaryIO) -> T.Generator[T.Dict, None, None]: - for h, s in parser.parse_path(fp, [b"moov", b"trak"]): - gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize) - for sample in gpmd_samples: - fp.seek(sample.offset, io.SEEK_SET) - data = fp.read(sample.size) - yield T.cast(T.Dict, GPMFSampleData.parse(data)) diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py index a718c6a9..6946f310 100644 --- a/mapillary_tools/geotag/simple_mp4_builder.py +++ b/mapillary_tools/geotag/simple_mp4_builder.py @@ -2,14 +2,25 @@ import io import typing as T -from . import ( +from . import io_utils +from ..mp4 import ( construct_mp4_parser as cparser, - io_utils, mp4_sample_parser as sample_parser, - simple_mp4_parser as parser, + simple_mp4_parser as sparser, ) -from .construct_mp4_parser import BoxDict -from .mp4_sample_parser import RawSample +from ..mp4.construct_mp4_parser import BoxDict +from ..mp4.mp4_sample_parser import RawSample + +""" +Variable naming conventions: + +- *_box: a BoxDict +- *_children: a list of child BoxDicts under the parent box +- *_boxdata: BoxDict["data"] +- *_data: the data in bytes of a box (without the header (type and size)) +- *_typed_data: the data in bytes of a box (with the header (type and size)) +""" + UINT32_MAX = 2**32 - 1 UINT64_MAX = 2**64 - 1 @@ -128,6 +139,7 @@ def _build_stts(sample_deltas: T.Iterable[int]) -> BoxDict: class _CompressedSampleCompositionOffset: __slots__ = ("sample_count", "sample_offset") # make sure dataclasses.asdict() produce the result as CompositionTimeToSampleBox expects + # SO DO NOT RENAME THE PROPERTIES BELOW sample_count: int sample_offset: int @@ -225,7 +237,7 @@ def _update_all_trak_tkhd(moov_chilren: T.Sequence[BoxDict]) -> None: ) -def _update_sbtl(trak: BoxDict, sample_offset: int) -> int: +def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int: assert trak["type"] == b"trak" # new samples with offsets updated @@ -243,14 +255,13 @@ def _update_sbtl(trak: BoxDict, sample_offset: int) -> int: ) sample_offset += sample.size stbl_box = cparser.find_box_at_pathx(trak, [b"trak", b"mdia", b"minf", b"stbl"]) - descriptions, _ = sample_parser.parse_raw_samples_from_stbl( - io.BytesIO(T.cast(bytes, stbl_box["data"])) + descriptions, _ = sample_parser.extract_raw_samples_from_stbl_data( + T.cast(bytes, stbl_box["data"]) ) stbl_children_boxes = build_stbl_from_raw_samples( descriptions, repositioned_samples ) - new_stbl_bytes = _STBLChildrenBuilderConstruct.build_boxlist(stbl_children_boxes) - stbl_box["data"] = new_stbl_bytes + stbl_box["data"] = _STBLChildrenBuilderConstruct.build_boxlist(stbl_children_boxes) return sample_offset @@ -263,13 +274,13 @@ def iterate_samples( stbl_box = cparser.find_box_at_pathx( box, [b"trak", b"mdia", b"minf", b"stbl"] ) - _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl( - io.BytesIO(T.cast(bytes, stbl_box["data"])) + _, raw_samples_iter = sample_parser.extract_raw_samples_from_stbl_data( + T.cast(bytes, stbl_box["data"]) ) yield from raw_samples_iter -def _build_mdat_header_bytes(mdat_size: int) -> bytes: +def _build_mdat_header_data(mdat_size: int) -> bytes: if UINT32_MAX < mdat_size + 8: return cparser.BoxHeader64.build( { @@ -302,7 +313,7 @@ def find_movie_timescale(moov_children: T.Sequence[BoxDict]) -> int: return T.cast(T.Dict, mvhd["data"])["timescale"] -def _build_moov_bytes(moov_children: T.Sequence[BoxDict]) -> bytes: +def _build_moov_typed_data(moov_children: T.Sequence[BoxDict]) -> bytes: return cparser.MP4WithoutSTBLBuilderConstruct.build_box( { "type": b"moov", @@ -324,62 +335,77 @@ def transform_mp4( ) -> io_utils.ChainedIO: # extract ftyp src_fp.seek(0) - source_ftyp_box_data = parser.parse_mp4_data_firstx(src_fp, [b"ftyp"]) - source_ftyp_data = cparser.MP4WithoutSTBLBuilderConstruct.build_box( - {"type": b"ftyp", "data": source_ftyp_box_data} - ) + ftyp_data = sparser.parse_mp4_data_firstx(src_fp, [b"ftyp"]) # extract moov src_fp.seek(0) - src_moov_data = parser.parse_mp4_data_firstx(src_fp, [b"moov"]) - moov_children = _MOOVChildrenParserConstruct.parse_boxlist(src_moov_data) + moov_data = sparser.parse_mp4_data_firstx(src_fp, [b"moov"]) + moov_children = _MOOVChildrenParserConstruct.parse_boxlist(moov_data) # filter tracks in moov moov_children = list(_filter_moov_children_boxes(moov_children)) # extract video samples source_samples = list(iterate_samples(moov_children)) - movie_sample_readers = [ + sample_readers: T.List[io.IOBase] = [ io_utils.SlicedIO(src_fp, sample.offset, sample.size) for sample in source_samples ] if sample_generator is not None: - sample_readers = list(sample_generator(src_fp, moov_children)) - else: - sample_readers = [] + sample_readers.extend(sample_generator(src_fp, moov_children)) _update_all_trak_tkhd(moov_children) - # moov_boxes should be immutable since here + return build_mp4(ftyp_data, moov_children, sample_readers) + + +def build_mp4( + ftyp_data: bytes, + moov_children: T.Sequence[BoxDict], + sample_readers: T.Iterable[io.IOBase], +) -> io_utils.ChainedIO: + ftyp_typed_data = cparser.MP4WithoutSTBLBuilderConstruct.build_box( + {"type": b"ftyp", "data": ftyp_data} + ) mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children)) + # moov_children should be immutable since here + new_moov_typed_data = _rewrite_and_build_moov_typed_data( + len(ftyp_typed_data), moov_children + ) return io_utils.ChainedIO( [ - io.BytesIO(source_ftyp_data), - io.BytesIO(_rewrite_moov(len(source_ftyp_data), moov_children)), - io.BytesIO(_build_mdat_header_bytes(mdat_body_size)), - *movie_sample_readers, + # ftyp + io.BytesIO(ftyp_typed_data), + # moov + io.BytesIO(new_moov_typed_data), + # mdat + io.BytesIO(_build_mdat_header_data(mdat_body_size)), *sample_readers, ] ) -def _rewrite_moov(moov_offset: int, moov_boxes: T.Sequence[BoxDict]) -> bytes: +def _rewrite_and_build_moov_typed_data( + moov_offset: int, moov_children: T.Sequence[BoxDict] +) -> bytes: # build moov for calculating moov size sample_offset = 0 - for box in _filter_trak_boxes(moov_boxes): - sample_offset = _update_sbtl(box, sample_offset) - moov_data = _build_moov_bytes(moov_boxes) - moov_data_size = len(moov_data) + for box in _filter_trak_boxes(moov_children): + sample_offset = _update_sbtl_sample_offsets(box, sample_offset) + moov_typed_data = _build_moov_typed_data(moov_children) + moov_typed_data_size = len(moov_typed_data) # mdat header size - mdat_body_size = sum(sample.size for sample in iterate_samples(moov_boxes)) - mdat_header = _build_mdat_header_bytes(mdat_body_size) + mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children)) + mdat_header_data = _build_mdat_header_data(mdat_body_size) # build moov for real - sample_offset = moov_offset + len(moov_data) + len(mdat_header) - for box in _filter_trak_boxes(moov_boxes): - sample_offset = _update_sbtl(box, sample_offset) - moov_data = _build_moov_bytes(moov_boxes) - assert len(moov_data) == moov_data_size, f"{len(moov_data)} != {moov_data_size}" - - return moov_data + sample_offset = moov_offset + len(moov_typed_data) + len(mdat_header_data) + for box in _filter_trak_boxes(moov_children): + sample_offset = _update_sbtl_sample_offsets(box, sample_offset) + moov_typed_data = _build_moov_typed_data(moov_children) + assert ( + len(moov_typed_data) == moov_typed_data_size + ), f"{len(moov_typed_data)} != {moov_typed_data_size}" + + return moov_typed_data diff --git a/mapillary_tools/mp4/__init__.py b/mapillary_tools/mp4/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mapillary_tools/geotag/construct_mp4_parser.py b/mapillary_tools/mp4/construct_mp4_parser.py similarity index 94% rename from mapillary_tools/geotag/construct_mp4_parser.py rename to mapillary_tools/mp4/construct_mp4_parser.py index f8b47c06..8ca1454b 100644 --- a/mapillary_tools/geotag/construct_mp4_parser.py +++ b/mapillary_tools/mp4/construct_mp4_parser.py @@ -441,12 +441,6 @@ def parse_box(self, data: bytes) -> BoxDict: def parse_boxlist(self, data: bytes) -> T.List[BoxDict]: return T.cast(T.List[BoxDict], self.BoxList.parse(data)) - def build_box(self, box: BoxDict) -> bytes: - return self.Box.build(box) - - def build_boxlist(self, boxes: T.Sequence[BoxDict]) -> bytes: - return self.BoxList.build(boxes) - class Box32ConstructBuilder(Box64ConstructBuilder): """ @@ -467,6 +461,18 @@ def Box(self) -> C.Construct: return self._box + def parse_box(self, data: bytes) -> BoxDict: + raise NotImplementedError("Box32ConstructBuilder does not support parsing") + + def parse_boxlist(self, data: bytes) -> T.List[BoxDict]: + raise NotImplementedError("Box32ConstructBuilder does not support parsing") + + def build_box(self, box: BoxDict) -> bytes: + return self.Box.build(box) + + def build_boxlist(self, boxes: T.Sequence[BoxDict]) -> bytes: + return self.BoxList.build(boxes) + # pyre-ignore[9]: pyre does not support recursive type SwitchMapType CMAP: SwitchMapType = { @@ -580,8 +586,17 @@ def _new_cmap_without_boxes( def find_box_at_pathx( box: T.Union[T.Sequence[BoxDict], BoxDict], path: T.Sequence[bytes] ) -> BoxDict: - if not path: + found = find_box_at_path(box, path) + if found is None: raise ValueError(f"box at path {path} not found") + return found + + +def find_box_at_path( + box: T.Union[T.Sequence[BoxDict], BoxDict], path: T.Sequence[bytes] +) -> T.Optional[BoxDict]: + if not path: + return None boxes: T.Sequence[BoxDict] if isinstance(box, dict): @@ -593,12 +608,13 @@ def find_box_at_pathx( if box["type"] == path[0]: if len(path) == 1: return box - else: - box_data = T.cast(T.Sequence[BoxDict], box["data"]) - # ListContainer from construct is not sequence - assert isinstance( - box_data, T.Sequence - ), f"expect a list of boxes but got {type(box_data)} at path {path}" - return find_box_at_pathx(box_data, path[1:]) - - raise ValueError(f"box at path {path} not found") + box_data = T.cast(T.Sequence[BoxDict], box["data"]) + # ListContainer from construct is not sequence + assert isinstance( + box_data, T.Sequence + ), f"expect a list of boxes but got {type(box_data)} at path {path}" + found = find_box_at_path(box_data, path[1:]) + if found is not None: + return found + + return None diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/mp4/mp4_sample_parser.py similarity index 50% rename from mapillary_tools/geotag/mp4_sample_parser.py rename to mapillary_tools/mp4/mp4_sample_parser.py index 4c90e0a3..1cebd682 100644 --- a/mapillary_tools/geotag/mp4_sample_parser.py +++ b/mapillary_tools/mp4/mp4_sample_parser.py @@ -1,55 +1,46 @@ import datetime -import io import typing as T from pathlib import Path -from . import construct_mp4_parser as cparser, simple_mp4_parser as parser +from . import construct_mp4_parser as cparser, simple_mp4_parser as sparser class RawSample(T.NamedTuple): # 1-based index description_idx: int - # sample offset + + # sample offset (offset from the beginning of the file) offset: int - # sample size + + # sample size (in bytes) size: int - # sample_delta read from stts entries, + + # sample_delta read from stts entries that decides when to decode the sample, # i.e. STTS(n) in the forumula DT(n+1) = DT(n) + STTS(n) + # NOTE: timescale is not applied yet (hence int) timedelta: int - # sample composition offset, + + # sample composition offset that decides when to present the sample, # i.e. CTTS(n) in the forumula CT(n) = DT(n) + CTTS(n). + # NOTE: timescale is not applied yet (hence int) composition_offset: int + # if it is a sync sample is_sync: bool -# TODO: can not inherit RawSample? class Sample(T.NamedTuple): - # copied from RawSample + raw_sample: RawSample - # 1-based index - description_idx: int - # sample offset - offset: int - # sample size - size: int - # sample delta in seconds read from stts entries, - # i.e. (STTS(n) / timescale) in the forumula DT(n+1) = DT(n) + STTS(n) - timedelta: float - # sample composition offset in seconds, - # i.e. (CTTS(n) / timescale) in the forumula CT(n) = DT(n) + CTTS(n). - composition_offset: float - # if it is a sync sample - is_sync: bool + # accumulated timedelta in seconds, i.e. DT(n) / timescale + exact_time: float + + # accumulated composition timedelta in seconds, i.e. CT(n) / timescale + exact_composition_time: float - # extended fields below + # exact timedelta in seconds, i.e. STTS(n) / timescale + exact_timedelta: float - # accumulated sample_delta in seconds, - # i.e. (DT(n) / timescale) in the forumula DT(n+1) = DT(n) + STTS(n) - time_offset: T.Union[int, float] - # accumulated composition offset in seconds, - # i.e. (CT(n) / timescale) in the forumula CT(n) = DT(n) + CTTS(n). - composition_time_offset: T.Union[int, float] # reference to the sample description description: T.Dict @@ -138,109 +129,28 @@ def _extract_raw_samples( def _extract_samples( raw_samples: T.Iterator[RawSample], descriptions: T.List, + timescale: int, ) -> T.Generator[Sample, None, None]: acc_delta = 0 for raw_sample in raw_samples: yield Sample( - description_idx=raw_sample.description_idx, - offset=raw_sample.offset, - size=raw_sample.size, - timedelta=raw_sample.timedelta, - composition_offset=raw_sample.composition_offset, - is_sync=raw_sample.is_sync, + raw_sample=raw_sample, description=descriptions[raw_sample.description_idx - 1], - time_offset=acc_delta, + exact_time=acc_delta / timescale, + exact_timedelta=raw_sample.timedelta / timescale, # CT(n) = DT(n) + CTTS(n) - composition_time_offset=(acc_delta + raw_sample.composition_offset), + exact_composition_time=(acc_delta + raw_sample.composition_offset) + / timescale, ) acc_delta += raw_sample.timedelta -def _apply_timescale(sample: Sample, media_timescale: int) -> Sample: - return Sample( - description_idx=sample.description_idx, - offset=sample.offset, - size=sample.size, - timedelta=sample.timedelta / media_timescale, - composition_offset=sample.composition_offset / media_timescale, - is_sync=sample.is_sync, - description=sample.description, - time_offset=sample.time_offset / media_timescale, - composition_time_offset=sample.composition_time_offset / media_timescale, - ) - - -def parse_raw_samples_from_stbl( - stbl: T.BinaryIO, - maxsize: int = -1, -) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]: - """ - DEPRECATED: use parse_raw_samples_from_stbl_bytes instead - """ - - descriptions = [] - sizes = [] - chunk_offsets = [] - chunk_entries = [] - timedeltas: T.List[int] = [] - composition_offsets: T.Optional[T.List[int]] = None - syncs: T.Optional[T.Set[int]] = None - - for h, s in parser.parse_boxes(stbl, maxsize=maxsize, extend_eof=False): - if h.type == b"stsd": - box = cparser.SampleDescriptionBox.parse(s.read(h.maxsize)) - descriptions = list(box.entries) - elif h.type == b"stsz": - box = cparser.SampleSizeBox.parse(s.read(h.maxsize)) - if box.sample_size == 0: - sizes = list(box.entries) - else: - sizes = [box.sample_size for _ in range(box.sample_count)] - elif h.type == b"stco": - box = cparser.ChunkOffsetBox.parse(s.read(h.maxsize)) - chunk_offsets = list(box.entries) - elif h.type == b"co64": - box = cparser.ChunkLargeOffsetBox.parse(s.read(h.maxsize)) - chunk_offsets = list(box.entries) - elif h.type == b"stsc": - box = cparser.SampleToChunkBox.parse(s.read(h.maxsize)) - chunk_entries = list(box.entries) - elif h.type == b"stts": - timedeltas = [] - box = cparser.TimeToSampleBox.parse(s.read(h.maxsize)) - for entry in box.entries: - for _ in range(entry.sample_count): - timedeltas.append(entry.sample_delta) - elif h.type == b"ctts": - composition_offsets = [] - box = cparser.CompositionTimeToSampleBox.parse(s.read(h.maxsize)) - for entry in box.entries: - for _ in range(entry.sample_count): - composition_offsets.append(entry.sample_offset) - elif h.type == b"stss": - box = cparser.SyncSampleBox.parse(s.read(h.maxsize)) - syncs = set(box.entries) - - # some stbl have less timedeltas than the sample count i.e. len(sizes), - # in this case append 0's to timedeltas - while len(timedeltas) < len(sizes): - timedeltas.append(0) - if composition_offsets is not None: - while len(composition_offsets) < len(sizes): - composition_offsets.append(0) - - raw_samples = _extract_raw_samples( - sizes, chunk_entries, chunk_offsets, timedeltas, composition_offsets, syncs - ) - return descriptions, raw_samples - - STBLBoxlistConstruct = cparser.Box64ConstructBuilder( T.cast(cparser.SwitchMapType, cparser.CMAP[b"stbl"]) ).BoxList -def parse_raw_samples_from_stbl_bytes( +def extract_raw_samples_from_stbl_data( stbl: bytes, ) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]: descriptions = [] @@ -251,9 +161,11 @@ def parse_raw_samples_from_stbl_bytes( composition_offsets: T.Optional[T.List[int]] = None syncs: T.Optional[T.Set[int]] = None - stbl_boxes = T.cast(T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl)) + stbl_children = T.cast( + T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl) + ) - for box in stbl_boxes: + for box in stbl_children: data: T.Dict = T.cast(T.Dict, box["data"]) if box["type"] == b"stsd": @@ -296,124 +208,108 @@ def parse_raw_samples_from_stbl_bytes( return descriptions, raw_samples -def parse_descriptions_from_trak(trak: T.BinaryIO, maxsize: int = -1) -> T.List[T.Dict]: - data = parser.parse_box_data_first( - trak, [b"mdia", b"minf", b"stbl", b"stsd"], maxsize=maxsize - ) - if data is None: - return [] - box = cparser.SampleDescriptionBox.parse(data) - return list(box.entries) - - -def parse_samples_from_trak( - trak: T.BinaryIO, - maxsize: int = -1, -) -> T.Generator[Sample, None, None]: - trak_start_offset = trak.tell() - - trak.seek(trak_start_offset, io.SEEK_SET) - mdhd_box = parser.parse_box_data_firstx(trak, [b"mdia", b"mdhd"], maxsize=maxsize) - mdhd = T.cast(T.Dict, cparser.MediaHeaderBox.parse(mdhd_box)) - - trak.seek(trak_start_offset, io.SEEK_SET) - h, s = parser.parse_box_path_firstx( - trak, [b"mdia", b"minf", b"stbl"], maxsize=maxsize - ) - descriptions, raw_samples = parse_raw_samples_from_stbl(s, maxsize=h.maxsize) - - yield from ( - _apply_timescale(s, mdhd["timescale"]) - for s in _extract_samples(raw_samples, descriptions) - ) - - -STSDBoxListConstruct = cparser.Box64ConstructBuilder( +_STSDBoxListConstruct = cparser.Box64ConstructBuilder( # pyre-ignore[6]: pyre does not support recursive type SwitchMapType {b"stsd": cparser.CMAP[b"stsd"]} ).BoxList class TrackBoxParser: - trak_boxes: T.Sequence[cparser.BoxDict] + trak_children: T.Sequence[cparser.BoxDict] stbl_data: bytes - def __init__(self, trak_boxes: T.Sequence[cparser.BoxDict]): - self.trak_boxes = trak_boxes - stbl = cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"minf", b"stbl"]) + def __init__(self, trak_children: T.Sequence[cparser.BoxDict]): + self.trak_children = trak_children + stbl = cparser.find_box_at_pathx( + self.trak_children, [b"mdia", b"minf", b"stbl"] + ) self.stbl_data = T.cast(bytes, stbl["data"]) - def tkhd(self) -> T.Dict: + def extract_tkhd_boxdata(self) -> T.Dict: return T.cast( - T.Dict, cparser.find_box_at_pathx(self.trak_boxes, [b"tkhd"])["data"] + T.Dict, cparser.find_box_at_pathx(self.trak_children, [b"tkhd"])["data"] ) def is_video_track(self) -> bool: - hdlr = cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"hdlr"]) + hdlr = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"hdlr"]) return T.cast(T.Dict[str, T.Any], hdlr["data"])["handler_type"] == b"vide" - def parse_sample_description(self) -> T.Dict: - boxes = STSDBoxListConstruct.parse(self.stbl_data) + def extract_sample_descriptions(self) -> T.List[T.Dict]: + # TODO: return [] if parsing fail + boxes = _STSDBoxListConstruct.parse(self.stbl_data) stsd = cparser.find_box_at_pathx( T.cast(T.Sequence[cparser.BoxDict], boxes), [b"stsd"] ) - return T.cast(T.Dict, stsd["data"]) + return T.cast(T.List[T.Dict], T.cast(T.Dict, stsd["data"])["entries"]) + + def extract_elst_boxdata(self) -> T.Optional[T.Dict]: + box = cparser.find_box_at_path(self.trak_children, [b"edts", b"elst"]) + if box is None: + return None + return T.cast(T.Dict, box["data"]) + + def extract_mdhd_boxdata(self) -> T.Dict: + box = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"]) + return T.cast(T.Dict, box["data"]) - def parse_raw_samples(self) -> T.Generator[RawSample, None, None]: - _, raw_samples = parse_raw_samples_from_stbl_bytes(self.stbl_data) + def extract_raw_samples(self) -> T.Generator[RawSample, None, None]: + _, raw_samples = extract_raw_samples_from_stbl_data(self.stbl_data) yield from raw_samples - def parse_samples(self) -> T.Generator[Sample, None, None]: - descriptions, raw_samples = parse_raw_samples_from_stbl_bytes(self.stbl_data) + def extract_samples(self) -> T.Generator[Sample, None, None]: + descriptions, raw_samples = extract_raw_samples_from_stbl_data(self.stbl_data) mdhd = T.cast( T.Dict, - cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"mdhd"])["data"], - ) - yield from ( - _apply_timescale(s, mdhd["timescale"]) - for s in _extract_samples(raw_samples, descriptions) + cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])["data"], ) + yield from _extract_samples(raw_samples, descriptions, mdhd["timescale"]) class MovieBoxParser: - moov_boxes: T.Sequence[cparser.BoxDict] + moov_children: T.Sequence[cparser.BoxDict] - def __init__(self, moov: bytes): - self.moov_boxes = T.cast( + def __init__(self, moov_data: bytes): + self.moov_children = T.cast( T.Sequence[cparser.BoxDict], - cparser.MOOVWithoutSTBLBuilderConstruct.BoxList.parse(moov), + cparser.MOOVWithoutSTBLBuilderConstruct.BoxList.parse(moov_data), ) @classmethod def parse_file(cls, video_path: Path) -> "MovieBoxParser": with video_path.open("rb") as fp: - moov = parser.parse_box_data_firstx(fp, [b"moov"]) + moov = sparser.parse_box_data_firstx(fp, [b"moov"]) + return MovieBoxParser(moov) + + @classmethod + def parse_stream(cls, stream: T.BinaryIO) -> "MovieBoxParser": + moov = sparser.parse_box_data_firstx(stream, [b"moov"]) return MovieBoxParser(moov) - def mvhd(self): - mvhd = cparser.find_box_at_pathx(self.moov_boxes, [b"mvhd"]) - return mvhd["data"] + def extract_mvhd_boxdata(self) -> T.Dict: + mvhd = cparser.find_box_at_pathx(self.moov_children, [b"mvhd"]) + return T.cast(T.Dict, mvhd["data"]) - def parse_tracks(self) -> T.Generator[TrackBoxParser, None, None]: - for box in self.moov_boxes: + def extract_tracks(self) -> T.Generator[TrackBoxParser, None, None]: + for box in self.moov_children: if box["type"] == b"trak": yield TrackBoxParser(T.cast(T.Sequence[cparser.BoxDict], box["data"])) - def parse_track_at(self, stream_idx: int) -> TrackBoxParser: + def extract_track_at(self, stream_idx: int) -> TrackBoxParser: """ stream_idx should be the stream_index specifier. See http://ffmpeg.org/ffmpeg.html#Stream-specifiers-1 > Stream numbering is based on the order of the streams as detected by libavformat """ - trak_boxes = [box for box in self.moov_boxes if box["type"] == b"trak"] + trak_boxes = [box for box in self.moov_children if box["type"] == b"trak"] if not (0 <= stream_idx < len(trak_boxes)): raise IndexError( "unable to read stream at %d from the track list (length %d)", stream_idx, len(trak_boxes), ) - return TrackBoxParser( - T.cast(T.Sequence[cparser.BoxDict], trak_boxes[stream_idx]["data"]) + trak_children = T.cast( + T.Sequence[cparser.BoxDict], trak_boxes[stream_idx]["data"] ) + return TrackBoxParser(trak_children) _DT_1904 = datetime.datetime.utcfromtimestamp(0).replace(year=1904) diff --git a/mapillary_tools/geotag/simple_mp4_parser.py b/mapillary_tools/mp4/simple_mp4_parser.py similarity index 100% rename from mapillary_tools/geotag/simple_mp4_parser.py rename to mapillary_tools/mp4/simple_mp4_parser.py diff --git a/mapillary_tools/sample_video.py b/mapillary_tools/sample_video.py index e70a370b..65d1baa7 100644 --- a/mapillary_tools/sample_video.py +++ b/mapillary_tools/sample_video.py @@ -9,7 +9,8 @@ from . import constants, exceptions, ffmpeg as ffmpeglib, geo, types, utils from .exif_write import ExifEdit -from .geotag import geotag_videos_from_video, mp4_sample_parser +from .geotag import geotag_videos_from_video +from .mp4 import mp4_sample_parser from .process_geotag_properties import GeotagSource LOG = logging.getLogger(__name__) @@ -234,10 +235,10 @@ def _sample_video_stream_by_distance( """ LOG.info("Extracting video samples") - sorted_samples = list(video_track_parser.parse_samples()) + sorted_samples = list(video_track_parser.extract_samples()) # we need sort sampels by composition time (CT) not the decoding offset (DT) # CT is the oder of videos streaming to audiences, as well as the order ffmpeg sampling - sorted_samples.sort(key=lambda sample: sample.composition_time_offset) + sorted_samples.sort(key=lambda sample: sample.exact_composition_time) LOG.info("Found total %d video samples", len(sorted_samples)) # interpolate sample points between the GPS track range (with 1ms buffer) @@ -251,11 +252,11 @@ def _sample_video_stream_by_distance( ( frame_idx_0based, video_sample, - interpolator.interpolate(video_sample.composition_time_offset), + interpolator.interpolate(video_sample.exact_composition_time), ) for frame_idx_0based, video_sample in enumerate(sorted_samples) if _within_track_time_range_buffered( - points, video_sample.composition_time_offset + points, video_sample.exact_composition_time ) ] LOG.info("Found total %d interpolated video samples", len(interp_sample_points)) @@ -316,7 +317,7 @@ def _sample_single_video_by_distance( LOG.info("Extracting video samples") video_stream_idx = video_stream["index"] moov_parser = mp4_sample_parser.MovieBoxParser.parse_file(video_path) - video_track_parser = moov_parser.parse_track_at(video_stream_idx) + video_track_parser = moov_parser.extract_track_at(video_stream_idx) sample_points_by_frame_idx = _sample_video_stream_by_distance( video_metadata.points, video_track_parser, sample_distance ) @@ -352,8 +353,8 @@ def _sample_single_video_by_distance( video_sample, interp = sample_points_by_frame_idx[sample_idx] assert ( - interp.time == video_sample.composition_time_offset - ), f"interpolated time {interp.time} should match the video sample time {video_sample.composition_time_offset}" + interp.time == video_sample.exact_composition_time + ), f"interpolated time {interp.time} should match the video sample time {video_sample.exact_composition_time}" timestamp = start_time + datetime.timedelta(seconds=interp.time) exif_edit = ExifEdit(sample_paths[0]) diff --git a/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py b/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py index 7f088677..9aef060f 100644 --- a/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py +++ b/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py @@ -1,8 +1,9 @@ import typing as T -from mapillary_tools import geo -from mapillary_tools.geotag import blackvue_parser, simple_mp4_parser -from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser +from ... import geo +from ...geotag import blackvue_parser +from ...mp4 import simple_mp4_parser as sparser +from .base_parser import BaseParser class BlackVueParser(BaseParser): @@ -21,7 +22,7 @@ def extract_points(self) -> T.Sequence[geo.Point]: points = blackvue_parser.extract_points(fp) or [] self.pointsFound = len(points) > 0 return points - except simple_mp4_parser.ParsingError: + except sparser.ParsingError: return [] def extract_make(self) -> T.Optional[str]: diff --git a/mapillary_tools/video_data_extraction/extractors/camm_parser.py b/mapillary_tools/video_data_extraction/extractors/camm_parser.py index 98e0b8d6..122a0ca5 100644 --- a/mapillary_tools/video_data_extraction/extractors/camm_parser.py +++ b/mapillary_tools/video_data_extraction/extractors/camm_parser.py @@ -1,9 +1,10 @@ import functools import typing as T -from mapillary_tools import geo -from mapillary_tools.geotag import camm_parser, simple_mp4_parser -from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser +from ... import geo +from ...geotag import camm_parser +from ...mp4 import simple_mp4_parser as sparser +from .base_parser import BaseParser class CammParser(BaseParser): @@ -23,7 +24,7 @@ def extract_points(self) -> T.Sequence[geo.Point]: with source_path.open("rb") as fp: try: return camm_parser.extract_points(fp) or [] - except simple_mp4_parser.ParsingError: + except sparser.ParsingError: return [] def extract_make(self) -> T.Optional[str]: diff --git a/mapillary_tools/video_data_extraction/extractors/gopro_parser.py b/mapillary_tools/video_data_extraction/extractors/gopro_parser.py index 3a4c3efd..77e488ad 100644 --- a/mapillary_tools/video_data_extraction/extractors/gopro_parser.py +++ b/mapillary_tools/video_data_extraction/extractors/gopro_parser.py @@ -1,8 +1,9 @@ import typing as T -from mapillary_tools import geo -from mapillary_tools.geotag import gpmf_parser, simple_mp4_parser -from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser +from ... import geo +from ...geotag import gpmf_parser +from ...mp4 import simple_mp4_parser as sparser +from .base_parser import BaseParser class GoProParser(BaseParser): @@ -21,7 +22,7 @@ def extract_points(self) -> T.Sequence[geo.Point]: points = gpmf_parser.extract_points(fp) or [] self.pointsFound = len(points) > 0 return points - except simple_mp4_parser.ParsingError: + except sparser.ParsingError: return [] def extract_make(self) -> T.Optional[str]: diff --git a/setup.py b/setup.py index 74b9a348..2d09b295 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ def readme(): "mapillary_tools", "mapillary_tools.commands", "mapillary_tools.geotag", + "mapillary_tools.mp4", "mapillary_tools.video_data_extraction", "mapillary_tools.video_data_extraction.extractors", ], diff --git a/tests/cli/simple_mp4_parser.py b/tests/cli/simple_mp4_parser.py index 3fd6ae52..da8e3f29 100644 --- a/tests/cli/simple_mp4_parser.py +++ b/tests/cli/simple_mp4_parser.py @@ -6,10 +6,10 @@ import typing as T from mapillary_tools import utils -from mapillary_tools.geotag import ( +from mapillary_tools.mp4 import ( construct_mp4_parser as cparser, mp4_sample_parser as sample_parser, - simple_mp4_parser as parser, + simple_mp4_parser as sparser, ) LOG = logging.getLogger(__name__) @@ -37,13 +37,15 @@ def _validate_samples( samples: T.List[sample_parser.RawSample] = [] with open(path, "rb") as fp: - for h, s in parser.parse_path( + for h, s in sparser.parse_path( fp, [b"moov", b"trak", b"mdia", b"minf", b"stbl"] ): ( descriptions, raw_samples, - ) = sample_parser.parse_raw_samples_from_stbl(s, maxsize=h.maxsize) + ) = sample_parser.parse_raw_samples_from_stbl_DEPRECATED( + s, maxsize=h.maxsize + ) samples.extend( sample for sample in raw_samples @@ -67,7 +69,7 @@ def _validate_samples( def _parse_structs(fp: T.BinaryIO): - for h, d, s in parser.parse_boxes_recursive(fp, box_list_types=box_list_types): + for h, d, s in sparser.parse_boxes_recursive(fp, box_list_types=box_list_types): margin = "\t" * d if h.size32 == 0: header = f"{str(h.type)} {h.box_size} (open-ended):" @@ -86,7 +88,7 @@ def _parse_structs(fp: T.BinaryIO): def _dump_box_data_at(fp: T.BinaryIO, box_type_path: T.List[bytes]): - for h, s in parser.parse_path(fp, box_type_path): + for h, s in sparser.parse_path(fp, box_type_path): max_chunk_size = 1024 read = 0 while read < h.maxsize or h.maxsize == -1: @@ -103,22 +105,26 @@ def _dump_box_data_at(fp: T.BinaryIO, box_type_path: T.List[bytes]): def _parse_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None): - for h, s in parser.parse_path(fp, [b"moov", b"trak"]): + for h, s in sparser.parse_path(fp, [b"moov", b"trak"]): offset = s.tell() - for h1, s1 in parser.parse_path(s, [b"mdia", b"mdhd"], maxsize=h.maxsize): + for h1, s1 in sparser.parse_path(s, [b"mdia", b"mdhd"], maxsize=h.maxsize): box = cparser.MediaHeaderBox.parse(s1.read(h.maxsize)) LOG.info(box) LOG.info(sample_parser.to_datetime(box.creation_time)) LOG.info(box.duration / box.timescale) s.seek(offset, io.SEEK_SET) - for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize): + for sample in sample_parser.parse_samples_from_trak_DEPRECATED( + s, maxsize=h.maxsize + ): if filters is None or sample.description["format"] in filters: print(sample) def _dump_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None): - for h, s in parser.parse_path(fp, [b"moov", b"trak"]): - for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize): + for h, s in sparser.parse_path(fp, [b"moov", b"trak"]): + for sample in sample_parser.parse_samples_from_trak_DEPRECATED( + s, maxsize=h.maxsize + ): if filters is None or sample.description["format"] in filters: fp.seek(sample.offset, io.SEEK_SET) data = fp.read(sample.size) @@ -203,13 +209,13 @@ def _process_path(parsed_args, path: pathlib.Path): if box_path is None: _parse_structs(fp) else: - data = parser.parse_mp4_data_firstx(fp, box_path) + data = sparser.parse_mp4_data_firstx(fp, box_path) _parse_structs(io.BytesIO(data)) elif parsed_args.full: if box_path is None: boxes = cparser.MP4ParserConstruct.BoxList.parse_stream(fp) else: - data = parser.parse_mp4_data_firstx(fp, box_path) + data = sparser.parse_mp4_data_firstx(fp, box_path) boxes = cparser.MP4ParserConstruct.BoxList.parse_stream( io.BytesIO(data) ) @@ -222,7 +228,7 @@ def _process_path(parsed_args, path: pathlib.Path): ) ) else: - data = parser.parse_mp4_data_firstx(fp, box_path) + data = sparser.parse_mp4_data_firstx(fp, box_path) boxes = ( cparser.MP4WithoutSTBLParserConstruct.BoxList.parse_stream( io.BytesIO(data) diff --git a/tests/unit/test_blackvue_parser.py b/tests/unit/test_blackvue_parser.py index 9ec65a45..0832a739 100644 --- a/tests/unit/test_blackvue_parser.py +++ b/tests/unit/test_blackvue_parser.py @@ -2,7 +2,8 @@ import mapillary_tools.geo as geo -from mapillary_tools.geotag import blackvue_parser, construct_mp4_parser as cparser +from mapillary_tools.geotag import blackvue_parser +from mapillary_tools.mp4 import construct_mp4_parser as cparser def test_parse_points(): diff --git a/tests/unit/test_camm_parser.py b/tests/unit/test_camm_parser.py index ca22b571..ed323783 100644 --- a/tests/unit/test_camm_parser.py +++ b/tests/unit/test_camm_parser.py @@ -7,9 +7,9 @@ from mapillary_tools.geotag import ( camm_builder, camm_parser, - construct_mp4_parser as cparser, simple_mp4_builder, ) +from mapillary_tools.mp4 import construct_mp4_parser as cparser def test_filter_points_by_edit_list(): diff --git a/tests/unit/test_mp4_sample_parser.py b/tests/unit/test_mp4_sample_parser.py index 003fae37..6e561fcb 100644 --- a/tests/unit/test_mp4_sample_parser.py +++ b/tests/unit/test_mp4_sample_parser.py @@ -1,19 +1,19 @@ from pathlib import Path -from mapillary_tools.geotag import mp4_sample_parser +from mapillary_tools.mp4 import mp4_sample_parser def test_movie_box_parser(): moov_parser = mp4_sample_parser.MovieBoxParser.parse_file( Path("tests/data/videos/sample-5s.mp4") ) - assert 2 == len(list(moov_parser.parse_tracks())) - video_track = moov_parser.parse_track_at(0) + assert 2 == len(list(moov_parser.extract_tracks())) + video_track = moov_parser.extract_track_at(0) assert video_track.is_video_track() - aac_track = moov_parser.parse_track_at(1) + aac_track = moov_parser.extract_track_at(1) assert not aac_track.is_video_track() - samples = list(video_track.parse_samples()) - raw_samples = list(video_track.parse_raw_samples()) + samples = list(video_track.extract_samples()) + raw_samples = list(video_track.extract_raw_samples()) assert 171 == len(samples) assert len(samples) == len(raw_samples) assert { @@ -31,7 +31,7 @@ def test_movie_box_parser(): "height": 70778880, } == { k: v - for k, v in video_track.tkhd().items() + for k, v in video_track.extract_tkhd_boxdata().items() if k in [ "version", @@ -47,8 +47,8 @@ def test_movie_box_parser(): "height", ] } - assert isinstance(video_track.tkhd(), dict) + assert isinstance(video_track.extract_tkhd_boxdata(), dict) for sample, raw_sample in zip(samples, raw_samples): - assert sample.offset == raw_sample.offset - assert sample.is_sync == raw_sample.is_sync - assert sample.size == raw_sample.size + assert sample.raw_sample.offset == raw_sample.offset + assert sample.raw_sample.is_sync == raw_sample.is_sync + assert sample.raw_sample.size == raw_sample.size diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py index 88b00cad..8bd67e7d 100644 --- a/tests/unit/test_simple_mp4_builder.py +++ b/tests/unit/test_simple_mp4_builder.py @@ -2,10 +2,12 @@ import typing as T from mapillary_tools.geotag import ( + simple_mp4_builder as builder, +) +from mapillary_tools.mp4 import ( construct_mp4_parser as cparser, mp4_sample_parser as sample_parser, - simple_mp4_builder as builder, - simple_mp4_parser as parser, + simple_mp4_parser as sparser, ) @@ -44,9 +46,9 @@ def _build_and_parse_stbl( d = cparser.Box32ConstructBuilder({b"stbl": cparser.CMAP[b"stbl"]}).Box.build( {"type": b"stbl", "data": s} ) - ss = parser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"]) + ss = sparser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"]) assert d[8:] == ss - _, parsed_samples = sample_parser.parse_raw_samples_from_stbl(io.BytesIO(ss)) + _, parsed_samples = sample_parser.extract_raw_samples_from_stbl_data(ss) assert expected_samples == list(parsed_samples) @@ -247,80 +249,82 @@ def test_parse_raw_samples_from_stbl(): }, ] ) - descs, sample_iter = sample_parser.parse_raw_samples_from_stbl( - io.BytesIO(stbl_bytes) - ) - samples = list(sample_iter) - assert [ - sample_parser.RawSample( - description_idx=1, - offset=1, - size=1, - timedelta=20, - composition_offset=0, - is_sync=True, - ), - sample_parser.RawSample( - description_idx=1, - offset=2, - size=2, - timedelta=30, - composition_offset=0, - is_sync=False, - ), - sample_parser.RawSample( - description_idx=1, - offset=5, - size=3, - timedelta=30, - composition_offset=0, - is_sync=True, - ), - sample_parser.RawSample( - description_idx=1, - offset=8, - size=3, - timedelta=50, - composition_offset=0, - is_sync=False, - ), - ] == samples - d = builder.build_stbl_from_raw_samples(descs, samples) - assert d[1:] == [ - { - "data": { - "entries": [ - {"sample_count": 1, "sample_delta": 20}, - {"sample_count": 2, "sample_delta": 30}, - {"sample_count": 1, "sample_delta": 50}, - ] + + def _verify_samples(descs, samples): + assert [ + sample_parser.RawSample( + description_idx=1, + offset=1, + size=1, + timedelta=20, + composition_offset=0, + is_sync=True, + ), + sample_parser.RawSample( + description_idx=1, + offset=2, + size=2, + timedelta=30, + composition_offset=0, + is_sync=False, + ), + sample_parser.RawSample( + description_idx=1, + offset=5, + size=3, + timedelta=30, + composition_offset=0, + is_sync=True, + ), + sample_parser.RawSample( + description_idx=1, + offset=8, + size=3, + timedelta=50, + composition_offset=0, + is_sync=False, + ), + ] == samples + d = builder.build_stbl_from_raw_samples(descs, samples) + assert d[1:] == [ + { + "data": { + "entries": [ + {"sample_count": 1, "sample_delta": 20}, + {"sample_count": 2, "sample_delta": 30}, + {"sample_count": 1, "sample_delta": 50}, + ] + }, + "type": b"stts", }, - "type": b"stts", - }, - { - "data": { - "entries": [ - { - "first_chunk": 1, - "sample_description_index": 1, - "samples_per_chunk": 2, - }, - { - "first_chunk": 2, - "sample_description_index": 1, - "samples_per_chunk": 2, - }, - ] + { + "data": { + "entries": [ + { + "first_chunk": 1, + "sample_description_index": 1, + "samples_per_chunk": 2, + }, + { + "first_chunk": 2, + "sample_description_index": 1, + "samples_per_chunk": 2, + }, + ] + }, + "type": b"stsc", }, - "type": b"stsc", - }, - { - "data": {"entries": [1, 2, 3, 3], "sample_count": 4, "sample_size": 0}, - "type": b"stsz", - }, - {"data": {"entries": [1, 5]}, "type": b"co64"}, - {"data": {"entries": [1, 3]}, "type": b"stss"}, - ] + { + "data": {"entries": [1, 2, 3, 3], "sample_count": 4, "sample_size": 0}, + "type": b"stsz", + }, + {"data": {"entries": [1, 5]}, "type": b"co64"}, + {"data": {"entries": [1, 3]}, "type": b"stss"}, + ] + + descs, sample_iter = sample_parser.extract_raw_samples_from_stbl_data(stbl_bytes) + samples = list(sample_iter) + _verify_samples(descs, samples) def test_box_header_0_building(): diff --git a/tests/unit/test_simple_mp4_parser.py b/tests/unit/test_simple_mp4_parser.py index 19701398..5b375842 100644 --- a/tests/unit/test_simple_mp4_parser.py +++ b/tests/unit/test_simple_mp4_parser.py @@ -1,9 +1,9 @@ import io import typing -from mapillary_tools.geotag import ( +from mapillary_tools.mp4 import ( construct_mp4_parser as cparser, - simple_mp4_parser as parser, + simple_mp4_parser as sparser, ) @@ -26,7 +26,7 @@ def _parse(data: bytes): } consumed_size = 0 ret = [] - for h, _d, s in parser.parse_boxes_recursive( + for h, _d, s in sparser.parse_boxes_recursive( io.BytesIO(data), box_list_types=box_list_types ): box_data = s.read(h.maxsize) @@ -42,7 +42,7 @@ def _parse(data: bytes): def _assert_box_type( data: bytes, - parsed: typing.List[typing.Tuple[parser.Header, bytes]], + parsed: typing.List[typing.Tuple[sparser.Header, bytes]], box_type: bytes, ): assert 1 == len(parsed) @@ -55,7 +55,7 @@ def _assert_box_type( def test_parse_box_header(): s = io.BytesIO(b"hello") - header = parser.parse_box_header(s, maxsize=0) + header = sparser.parse_box_header(s, maxsize=0) assert header.header_size == 0 assert header.box_size == 0 assert header.type == b""