refactor: mp4 module (#652)

* wip * more naming refactoring * simplify Sample and RawSample * rename: import simple_mp4_parser as sparser * refactor mp4 sample parser * refactor camm parser * refactor gpmf parser * rename parse_ to extract_ * remove deprecated functions * rename composition_timedelta to composition_offset * move mp4 to a separate module * add the missing __init__.py * fix import * update setup.py
mapillary · Nov 28, 2024 · 68e5e31 · 68e5e31
1 parent b37895e
commit 68e5e31
Show file tree

Hide file tree

Showing 21 changed files with 411 additions and 506 deletions.
diff --git a/mapillary_tools/geotag/blackvue_parser.py b/mapillary_tools/geotag/blackvue_parser.py
@@ -7,7 +7,7 @@
 import pynmea2
 
 from .. import geo
-from . import simple_mp4_parser
+from ..mp4 import simple_mp4_parser as sparser
 
 
 LOG = logging.getLogger(__name__)
@@ -55,8 +55,8 @@ def _parse_gps_box(gps_data: bytes) -> T.Generator[geo.Point, None, None]:
 
 def extract_camera_model(fp: T.BinaryIO) -> str:
     try:
-        cprt_bytes = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"cprt"])
-    except simple_mp4_parser.ParsingError:
+        cprt_bytes = sparser.parse_mp4_data_first(fp, [b"free", b"cprt"])
+    except sparser.ParsingError:
         return ""
 
     if cprt_bytes is None:
@@ -91,7 +91,7 @@ def extract_camera_model(fp: T.BinaryIO) -> str:
 
 
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
-    gps_data = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"gps "])
+    gps_data = sparser.parse_mp4_data_first(fp, [b"free", b"gps "])
     if gps_data is None:
         return None
 

diff --git a/mapillary_tools/geotag/camm_builder.py b/mapillary_tools/geotag/camm_builder.py
@@ -2,11 +2,13 @@
 import typing as T
 
 from .. import geo, types
+from ..mp4 import (
+    construct_mp4_parser as cparser,
+    mp4_sample_parser as sample_parser,
+)
 
 from . import (
     camm_parser,
-    construct_mp4_parser as cparser,
-    mp4_sample_parser as sample_parser,
     simple_mp4_builder as builder,
 )
 from .simple_mp4_builder import BoxDict

diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
@@ -9,12 +9,8 @@
 
 import construct as C
 
-from . import (
-    construct_mp4_parser as cparser,
-    geo,
-    mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
-)
+from . import geo
+from ..mp4 import simple_mp4_parser as sparser, mp4_sample_parser as sample_parser
 
 
 LOG = logging.getLogger(__name__)
@@ -82,12 +78,12 @@ class CAMMType(Enum):
 def _parse_point_from_sample(
     fp: T.BinaryIO, sample: sample_parser.Sample
 ) -> T.Optional[geo.Point]:
-    fp.seek(sample.offset, io.SEEK_SET)
-    data = fp.read(sample.size)
+    fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+    data = fp.read(sample.raw_sample.size)
     box = CAMMSampleData.parse(data)
     if box.type == CAMMType.MIN_GPS.value:
         return geo.Point(
-            time=sample.time_offset,
+            time=sample.exact_time,
             lat=box.data[0],
             lon=box.data[1],
             alt=box.data[2],
@@ -97,7 +93,7 @@ def _parse_point_from_sample(
         # Not using box.data.time_gps_epoch as the point timestamp
         # because it is from another clock
         return geo.Point(
-            time=sample.time_offset,
+            time=sample.exact_time,
             lat=box.data.latitude,
             lon=box.data.longitude,
             alt=box.data.altitude,
@@ -148,15 +144,8 @@ def elst_entry_to_seconds(
     return (media_time, duration)
 
 
-def _extract_camm_samples(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[sample_parser.Sample, None, None]:
-    samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize)
-    camm_samples = (
-        sample for sample in samples if sample.description["format"] == b"camm"
-    )
-    yield from camm_samples
+def _is_camm_description(description: T.Dict) -> bool:
+    return description["format"] == b"camm"
 
 
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
@@ -166,59 +155,37 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
     """
 
     points = None
-    movie_timescale = None
-    media_timescale = None
-    elst_entries = None
 
-    for h, s in parser.parse_path(fp, [b"moov", [b"mvhd", b"trak"]]):
-        if h.type == b"trak":
-            trak_start_offset = s.tell()
-
-            descriptions = sample_parser.parse_descriptions_from_trak(
-                s, maxsize=h.maxsize
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
+        if any(_is_camm_description(d) for d in descriptions):
+            maybe_points = (
+                _parse_point_from_sample(fp, sample)
+                for sample in track.extract_samples()
+                if _is_camm_description(sample.description)
             )
-            camm_descriptions = [d for d in descriptions if d["format"] == b"camm"]
-            if camm_descriptions:
-                s.seek(trak_start_offset, io.SEEK_SET)
-                camm_samples = _extract_camm_samples(s, h.maxsize)
-
-                points_with_nones = (
-                    _parse_point_from_sample(fp, sample)
-                    for sample in camm_samples
-                    if sample.description["format"] == b"camm"
-                )
-
-                points = [p for p in points_with_nones if p is not None]
-                if points:
-                    s.seek(trak_start_offset)
-                    elst_data = parser.parse_box_data_first(
-                        s, [b"edts", b"elst"], maxsize=h.maxsize
-                    )
-                    if elst_data is not None:
-                        elst_entries = cparser.EditBox.parse(elst_data)["entries"]
-
-                    s.seek(trak_start_offset)
-                    mdhd_data = parser.parse_box_data_firstx(
-                        s, [b"mdia", b"mdhd"], maxsize=h.maxsize
-                    )
-                    mdhd = cparser.MediaHeaderBox.parse(mdhd_data)
-                    media_timescale = mdhd["timescale"]
-        else:
-            assert h.type == b"mvhd"
-            if not movie_timescale:
-                mvhd = cparser.MovieHeaderBox.parse(s.read(h.maxsize))
-                movie_timescale = mvhd["timescale"]
-
-        # exit when both found
-        if movie_timescale is not None and points:
-            break
-
-    if points and movie_timescale and media_timescale and elst_entries:
-        segments = [
-            elst_entry_to_seconds(entry, movie_timescale, media_timescale)
-            for entry in elst_entries
-        ]
-        points = list(filter_points_by_elst(points, segments))
+            points = [p for p in maybe_points if p is not None]
+            if points:
+                elst_boxdata = track.extract_elst_boxdata()
+                if elst_boxdata is not None:
+                    elst_entries = elst_boxdata["entries"]
+                    if elst_entries:
+                        # media_timescale
+                        mdhd_boxdata = track.extract_mdhd_boxdata()
+                        media_timescale = mdhd_boxdata["timescale"]
+                        # movie_timescale
+                        mvhd_boxdata = moov.extract_mvhd_boxdata()
+                        movie_timescale = mvhd_boxdata["timescale"]
+                        segments = [
+                            elst_entry_to_seconds(
+                                entry,
+                                movie_timescale=movie_timescale,
+                                media_timescale=media_timescale,
+                            )
+                            for entry in elst_entries
+                        ]
+                        points = list(filter_points_by_elst(points, segments))
 
     return points
 
@@ -238,15 +205,15 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.Point]:
 )
 
 
-def _decode_quietly(data: bytes, h: parser.Header) -> str:
+def _decode_quietly(data: bytes, h: sparser.Header) -> str:
     try:
         return data.decode("utf-8")
     except UnicodeDecodeError:
         LOG.warning("Failed to decode %s: %s", h, data[:512])
         return ""
 
 
-def _parse_quietly(data: bytes, h: parser.Header) -> bytes:
+def _parse_quietly(data: bytes, h: sparser.Header) -> bytes:
     try:
         parsed = MakeOrModel.parse(data)
     except C.ConstructError:
@@ -256,7 +223,7 @@ def _parse_quietly(data: bytes, h: parser.Header) -> bytes:
 
 
 def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]:
-    header_and_stream = parser.parse_path(
+    header_and_stream = sparser.parse_path(
         fp,
         [
             b"moov",
@@ -296,7 +263,7 @@ def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]:
             # quit when both found
             if make and model:
                 break
-    except parser.ParsingError:
+    except sparser.ParsingError:
         pass
 
     if make:

diff --git a/mapillary_tools/geotag/geotag_videos_from_video.py b/mapillary_tools/geotag/geotag_videos_from_video.py
@@ -12,9 +12,9 @@
     camm_parser,
     gpmf_gps_filter,
     gpmf_parser,
-    simple_mp4_parser as parser,
     utils as video_utils,
 )
+from ..mp4 import simple_mp4_parser as sparser
 from .geotag_from_generic import GeotagVideosFromGeneric
 
 LOG = logging.getLogger(__name__)
@@ -77,7 +77,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points = camm_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points = None
 
                 if points is not None:
@@ -100,7 +100,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points_with_fix = gpmf_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points_with_fix = None
 
                 if points_with_fix is not None:
@@ -123,7 +123,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points = blackvue_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points = None
 
                 if points is not None: