diff --git a/mapillary_tools/geotag/blackvue_parser.py b/mapillary_tools/geotag/blackvue_parser.py
index a34d5327..99fc92ba 100644
--- a/mapillary_tools/geotag/blackvue_parser.py
+++ b/mapillary_tools/geotag/blackvue_parser.py
@@ -7,7 +7,7 @@
 import pynmea2
 
 from .. import geo
-from . import simple_mp4_parser
+from ..mp4 import simple_mp4_parser as sparser
 
 
 LOG = logging.getLogger(__name__)
@@ -55,8 +55,8 @@ def _parse_gps_box(gps_data: bytes) -> T.Generator[geo.Point, None, None]:
 
 def extract_camera_model(fp: T.BinaryIO) -> str:
     try:
-        cprt_bytes = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"cprt"])
-    except simple_mp4_parser.ParsingError:
+        cprt_bytes = sparser.parse_mp4_data_first(fp, [b"free", b"cprt"])
+    except sparser.ParsingError:
         return ""
 
     if cprt_bytes is None:
@@ -91,7 +91,7 @@ def extract_camera_model(fp: T.BinaryIO) -> str:
 
 
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
-    gps_data = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"gps "])
+    gps_data = sparser.parse_mp4_data_first(fp, [b"free", b"gps "])
     if gps_data is None:
         return None
 
diff --git a/mapillary_tools/geotag/camm_builder.py b/mapillary_tools/geotag/camm_builder.py
index 5ff61e35..5a013f57 100644
--- a/mapillary_tools/geotag/camm_builder.py
+++ b/mapillary_tools/geotag/camm_builder.py
@@ -2,11 +2,13 @@
 import typing as T
 
 from .. import geo, types
+from ..mp4 import (
+    construct_mp4_parser as cparser,
+    mp4_sample_parser as sample_parser,
+)
 
 from . import (
     camm_parser,
-    construct_mp4_parser as cparser,
-    mp4_sample_parser as sample_parser,
     simple_mp4_builder as builder,
 )
 from .simple_mp4_builder import BoxDict
diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index 994769d4..f93b7ffd 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -9,12 +9,8 @@
 
 import construct as C
 
-from . import (
-    construct_mp4_parser as cparser,
-    geo,
-    mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
-)
+from . import geo
+from ..mp4 import simple_mp4_parser as sparser, mp4_sample_parser as sample_parser
 
 
 LOG = logging.getLogger(__name__)
@@ -82,12 +78,12 @@ class CAMMType(Enum):
 def _parse_point_from_sample(
     fp: T.BinaryIO, sample: sample_parser.Sample
 ) -> T.Optional[geo.Point]:
-    fp.seek(sample.offset, io.SEEK_SET)
-    data = fp.read(sample.size)
+    fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+    data = fp.read(sample.raw_sample.size)
     box = CAMMSampleData.parse(data)
     if box.type == CAMMType.MIN_GPS.value:
         return geo.Point(
-            time=sample.time_offset,
+            time=sample.exact_time,
             lat=box.data[0],
             lon=box.data[1],
             alt=box.data[2],
@@ -97,7 +93,7 @@ def _parse_point_from_sample(
         # Not using box.data.time_gps_epoch as the point timestamp
         # because it is from another clock
         return geo.Point(
-            time=sample.time_offset,
+            time=sample.exact_time,
             lat=box.data.latitude,
             lon=box.data.longitude,
             alt=box.data.altitude,
@@ -148,15 +144,8 @@ def elst_entry_to_seconds(
     return (media_time, duration)
 
 
-def _extract_camm_samples(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[sample_parser.Sample, None, None]:
-    samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize)
-    camm_samples = (
-        sample for sample in samples if sample.description["format"] == b"camm"
-    )
-    yield from camm_samples
+def _is_camm_description(description: T.Dict) -> bool:
+    return description["format"] == b"camm"
 
 
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
@@ -166,59 +155,37 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
     """
 
     points = None
-    movie_timescale = None
-    media_timescale = None
-    elst_entries = None
 
-    for h, s in parser.parse_path(fp, [b"moov", [b"mvhd", b"trak"]]):
-        if h.type == b"trak":
-            trak_start_offset = s.tell()
-
-            descriptions = sample_parser.parse_descriptions_from_trak(
-                s, maxsize=h.maxsize
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
+        if any(_is_camm_description(d) for d in descriptions):
+            maybe_points = (
+                _parse_point_from_sample(fp, sample)
+                for sample in track.extract_samples()
+                if _is_camm_description(sample.description)
             )
-            camm_descriptions = [d for d in descriptions if d["format"] == b"camm"]
-            if camm_descriptions:
-                s.seek(trak_start_offset, io.SEEK_SET)
-                camm_samples = _extract_camm_samples(s, h.maxsize)
-
-                points_with_nones = (
-                    _parse_point_from_sample(fp, sample)
-                    for sample in camm_samples
-                    if sample.description["format"] == b"camm"
-                )
-
-                points = [p for p in points_with_nones if p is not None]
-                if points:
-                    s.seek(trak_start_offset)
-                    elst_data = parser.parse_box_data_first(
-                        s, [b"edts", b"elst"], maxsize=h.maxsize
-                    )
-                    if elst_data is not None:
-                        elst_entries = cparser.EditBox.parse(elst_data)["entries"]
-
-                    s.seek(trak_start_offset)
-                    mdhd_data = parser.parse_box_data_firstx(
-                        s, [b"mdia", b"mdhd"], maxsize=h.maxsize
-                    )
-                    mdhd = cparser.MediaHeaderBox.parse(mdhd_data)
-                    media_timescale = mdhd["timescale"]
-        else:
-            assert h.type == b"mvhd"
-            if not movie_timescale:
-                mvhd = cparser.MovieHeaderBox.parse(s.read(h.maxsize))
-                movie_timescale = mvhd["timescale"]
-
-        # exit when both found
-        if movie_timescale is not None and points:
-            break
-
-    if points and movie_timescale and media_timescale and elst_entries:
-        segments = [
-            elst_entry_to_seconds(entry, movie_timescale, media_timescale)
-            for entry in elst_entries
-        ]
-        points = list(filter_points_by_elst(points, segments))
+            points = [p for p in maybe_points if p is not None]
+            if points:
+                elst_boxdata = track.extract_elst_boxdata()
+                if elst_boxdata is not None:
+                    elst_entries = elst_boxdata["entries"]
+                    if elst_entries:
+                        # media_timescale
+                        mdhd_boxdata = track.extract_mdhd_boxdata()
+                        media_timescale = mdhd_boxdata["timescale"]
+                        # movie_timescale
+                        mvhd_boxdata = moov.extract_mvhd_boxdata()
+                        movie_timescale = mvhd_boxdata["timescale"]
+                        segments = [
+                            elst_entry_to_seconds(
+                                entry,
+                                movie_timescale=movie_timescale,
+                                media_timescale=media_timescale,
+                            )
+                            for entry in elst_entries
+                        ]
+                        points = list(filter_points_by_elst(points, segments))
 
     return points
 
@@ -238,7 +205,7 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.Point]:
 )
 
 
-def _decode_quietly(data: bytes, h: parser.Header) -> str:
+def _decode_quietly(data: bytes, h: sparser.Header) -> str:
     try:
         return data.decode("utf-8")
     except UnicodeDecodeError:
@@ -246,7 +213,7 @@ def _decode_quietly(data: bytes, h: parser.Header) -> str:
         return ""
 
 
-def _parse_quietly(data: bytes, h: parser.Header) -> bytes:
+def _parse_quietly(data: bytes, h: sparser.Header) -> bytes:
     try:
         parsed = MakeOrModel.parse(data)
     except C.ConstructError:
@@ -256,7 +223,7 @@ def _parse_quietly(data: bytes, h: parser.Header) -> bytes:
 
 
 def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]:
-    header_and_stream = parser.parse_path(
+    header_and_stream = sparser.parse_path(
         fp,
         [
             b"moov",
@@ -296,7 +263,7 @@ def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]:
             # quit when both found
             if make and model:
                 break
-    except parser.ParsingError:
+    except sparser.ParsingError:
         pass
 
     if make:
diff --git a/mapillary_tools/geotag/geotag_videos_from_video.py b/mapillary_tools/geotag/geotag_videos_from_video.py
index 77be8c6f..d1d31c0d 100644
--- a/mapillary_tools/geotag/geotag_videos_from_video.py
+++ b/mapillary_tools/geotag/geotag_videos_from_video.py
@@ -12,9 +12,9 @@
     camm_parser,
     gpmf_gps_filter,
     gpmf_parser,
-    simple_mp4_parser as parser,
     utils as video_utils,
 )
+from ..mp4 import simple_mp4_parser as sparser
 from .geotag_from_generic import GeotagVideosFromGeneric
 
 LOG = logging.getLogger(__name__)
@@ -77,7 +77,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points = camm_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points = None
 
                 if points is not None:
@@ -100,7 +100,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points_with_fix = gpmf_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points_with_fix = None
 
                 if points_with_fix is not None:
@@ -123,7 +123,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points = blackvue_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points = None
 
                 if points is not None:
diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index c5b4945b..7feaf713 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -5,7 +5,7 @@
 import construct as C
 
 from .. import geo
-from . import mp4_sample_parser as sample_parser, simple_mp4_parser as parser
+from ..mp4 import mp4_sample_parser as sample_parser
 
 """
 Parsing GPS from GPMF data format stored in GoPros. See the GPMF spec: https://github.com/gopro/gpmf-parser
@@ -304,8 +304,8 @@ def _extract_dvnm_from_samples(
     dvnm_by_dvid: T.Dict[int, bytes] = {}
 
     for sample in samples:
-        fp.seek(sample.offset, io.SEEK_SET)
-        data = fp.read(sample.size)
+        fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+        data = fp.read(sample.raw_sample.size)
         gpmf_sample_data = T.cast(T.Dict, GPMFSampleData.parse(data))
 
         # iterate devices
@@ -328,8 +328,8 @@ def _extract_points_from_samples(
     points_by_dvid: T.Dict[int, T.List[geo.PointWithFix]] = {}
 
     for sample in samples:
-        fp.seek(sample.offset, io.SEEK_SET)
-        data = fp.read(sample.size)
+        fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+        data = fp.read(sample.raw_sample.size)
         gpmf_sample_data = T.cast(T.Dict, GPMFSampleData.parse(data))
 
         # iterate devices
@@ -338,9 +338,9 @@ def _extract_points_from_samples(
             sample_points = _find_first_gps_stream(device["data"])
             if sample_points:
                 # interpolate timestamps in between
-                avg_timedelta = sample.timedelta / len(sample_points)
+                avg_timedelta = sample.exact_timedelta / len(sample_points)
                 for idx, point in enumerate(sample_points):
-                    point.time = sample.time_offset + avg_timedelta * idx
+                    point.time = sample.exact_time + avg_timedelta * idx
 
                 device_id = _find_first_device_id(device["data"])
                 device_points = points_by_dvid.setdefault(device_id, [])
@@ -350,18 +350,25 @@ def _extract_points_from_samples(
     return values[0] if values else []
 
 
+def _is_gpmd_description(description: T.Dict) -> bool:
+    return description["format"] == b"gpmd"
+
+
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
     """
     Return a list of points (could be empty) if it is a valid GoPro video,
     otherwise None
     """
     points = None
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
-        trak_start_offset = s.tell()
-        descriptions = _extract_gpmd_descriptions_from_trak(s, h.maxsize)
-        if descriptions:
-            s.seek(trak_start_offset, io.SEEK_SET)
-            gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
+        if any(_is_gpmd_description(d) for d in descriptions):
+            gpmd_samples = (
+                sample
+                for sample in track.extract_samples()
+                if _is_gpmd_description(sample.description)
+            )
             points = list(_extract_points_from_samples(fp, gpmd_samples))
             # return the firstly found non-empty points
             if points:
@@ -370,35 +377,19 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
     return points
 
 
-def _extract_gpmd_descriptions_from_trak(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-):
-    descriptions = sample_parser.parse_descriptions_from_trak(s, maxsize=maxsize)
-    return [d for d in descriptions if d["format"] == b"gpmd"]
-
-
-def _extract_gpmd_samples_from_trak(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[sample_parser.Sample, None, None]:
-    trak_start_offset = s.tell()
-    gpmd_descriptions = _extract_gpmd_descriptions_from_trak(s, maxsize=maxsize)
-    if gpmd_descriptions:
-        s.seek(trak_start_offset, io.SEEK_SET)
-        samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize)
-        gpmd_samples = (
-            sample for sample in samples if sample.description["format"] == b"gpmd"
-        )
-        yield from gpmd_samples
-
-
 def extract_all_device_names(fp: T.BinaryIO) -> T.Dict[int, bytes]:
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
-        gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
-        device_names = _extract_dvnm_from_samples(fp, gpmd_samples)
-        if device_names:
-            return device_names
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
+        if any(_is_gpmd_description(d) for d in descriptions):
+            gpmd_samples = (
+                sample
+                for sample in track.extract_samples()
+                if _is_gpmd_description(sample.description)
+            )
+            device_names = _extract_dvnm_from_samples(fp, gpmd_samples)
+            if device_names:
+                return device_names
     return {}
 
 
@@ -439,12 +430,3 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.PointWithFix]:
     if points is None:
         return []
     return points
-
-
-def iterate_gpmd_sample_data(fp: T.BinaryIO) -> T.Generator[T.Dict, None, None]:
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
-        gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
-        for sample in gpmd_samples:
-            fp.seek(sample.offset, io.SEEK_SET)
-            data = fp.read(sample.size)
-            yield T.cast(T.Dict, GPMFSampleData.parse(data))
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index a718c6a9..6946f310 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -2,14 +2,25 @@
 import io
 import typing as T
 
-from . import (
+from . import io_utils
+from ..mp4 import (
     construct_mp4_parser as cparser,
-    io_utils,
     mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
-from .construct_mp4_parser import BoxDict
-from .mp4_sample_parser import RawSample
+from ..mp4.construct_mp4_parser import BoxDict
+from ..mp4.mp4_sample_parser import RawSample
+
+"""
+Variable naming conventions:
+
+- *_box: a BoxDict
+- *_children: a list of child BoxDicts under the parent box
+- *_boxdata: BoxDict["data"]
+- *_data: the payload bytes of a box, excluding the header (type and size)
+- *_typed_data: the complete bytes of a box, including the header (type and size)
+"""
+
 
 UINT32_MAX = 2**32 - 1
 UINT64_MAX = 2**64 - 1
@@ -128,6 +139,7 @@ def _build_stts(sample_deltas: T.Iterable[int]) -> BoxDict:
 class _CompressedSampleCompositionOffset:
     __slots__ = ("sample_count", "sample_offset")
     # make sure dataclasses.asdict() produce the result as CompositionTimeToSampleBox expects
+    # SO DO NOT RENAME THE PROPERTIES BELOW
     sample_count: int
     sample_offset: int
 
@@ -225,7 +237,7 @@ def _update_all_trak_tkhd(moov_chilren: T.Sequence[BoxDict]) -> None:
 )
 
 
-def _update_sbtl(trak: BoxDict, sample_offset: int) -> int:
+def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
     assert trak["type"] == b"trak"
 
     # new samples with offsets updated
@@ -243,14 +255,13 @@ def _update_sbtl(trak: BoxDict, sample_offset: int) -> int:
         )
         sample_offset += sample.size
     stbl_box = cparser.find_box_at_pathx(trak, [b"trak", b"mdia", b"minf", b"stbl"])
-    descriptions, _ = sample_parser.parse_raw_samples_from_stbl(
-        io.BytesIO(T.cast(bytes, stbl_box["data"]))
+    descriptions, _ = sample_parser.extract_raw_samples_from_stbl_data(
+        T.cast(bytes, stbl_box["data"])
     )
     stbl_children_boxes = build_stbl_from_raw_samples(
         descriptions, repositioned_samples
     )
-    new_stbl_bytes = _STBLChildrenBuilderConstruct.build_boxlist(stbl_children_boxes)
-    stbl_box["data"] = new_stbl_bytes
+    stbl_box["data"] = _STBLChildrenBuilderConstruct.build_boxlist(stbl_children_boxes)
 
     return sample_offset
 
@@ -263,13 +274,13 @@ def iterate_samples(
             stbl_box = cparser.find_box_at_pathx(
                 box, [b"trak", b"mdia", b"minf", b"stbl"]
             )
-            _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl(
-                io.BytesIO(T.cast(bytes, stbl_box["data"]))
+            _, raw_samples_iter = sample_parser.extract_raw_samples_from_stbl_data(
+                T.cast(bytes, stbl_box["data"])
             )
             yield from raw_samples_iter
 
 
-def _build_mdat_header_bytes(mdat_size: int) -> bytes:
+def _build_mdat_header_data(mdat_size: int) -> bytes:
     if UINT32_MAX < mdat_size + 8:
         return cparser.BoxHeader64.build(
             {
@@ -302,7 +313,7 @@ def find_movie_timescale(moov_children: T.Sequence[BoxDict]) -> int:
     return T.cast(T.Dict, mvhd["data"])["timescale"]
 
 
-def _build_moov_bytes(moov_children: T.Sequence[BoxDict]) -> bytes:
+def _build_moov_typed_data(moov_children: T.Sequence[BoxDict]) -> bytes:
     return cparser.MP4WithoutSTBLBuilderConstruct.build_box(
         {
             "type": b"moov",
@@ -324,62 +335,77 @@ def transform_mp4(
 ) -> io_utils.ChainedIO:
     # extract ftyp
     src_fp.seek(0)
-    source_ftyp_box_data = parser.parse_mp4_data_firstx(src_fp, [b"ftyp"])
-    source_ftyp_data = cparser.MP4WithoutSTBLBuilderConstruct.build_box(
-        {"type": b"ftyp", "data": source_ftyp_box_data}
-    )
+    ftyp_data = sparser.parse_mp4_data_firstx(src_fp, [b"ftyp"])
 
     # extract moov
     src_fp.seek(0)
-    src_moov_data = parser.parse_mp4_data_firstx(src_fp, [b"moov"])
-    moov_children = _MOOVChildrenParserConstruct.parse_boxlist(src_moov_data)
+    moov_data = sparser.parse_mp4_data_firstx(src_fp, [b"moov"])
+    moov_children = _MOOVChildrenParserConstruct.parse_boxlist(moov_data)
 
     # filter tracks in moov
     moov_children = list(_filter_moov_children_boxes(moov_children))
 
     # extract video samples
     source_samples = list(iterate_samples(moov_children))
-    movie_sample_readers = [
+    sample_readers: T.List[io.IOBase] = [
         io_utils.SlicedIO(src_fp, sample.offset, sample.size)
         for sample in source_samples
     ]
     if sample_generator is not None:
-        sample_readers = list(sample_generator(src_fp, moov_children))
-    else:
-        sample_readers = []
+        sample_readers.extend(sample_generator(src_fp, moov_children))
 
     _update_all_trak_tkhd(moov_children)
 
-    # moov_boxes should be immutable since here
+    return build_mp4(ftyp_data, moov_children, sample_readers)
+
+
+def build_mp4(
+    ftyp_data: bytes,
+    moov_children: T.Sequence[BoxDict],
+    sample_readers: T.Iterable[io.IOBase],
+) -> io_utils.ChainedIO:
+    ftyp_typed_data = cparser.MP4WithoutSTBLBuilderConstruct.build_box(
+        {"type": b"ftyp", "data": ftyp_data}
+    )
     mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children))
+    # from here on moov_children must stay fixed, except stbl offsets rewritten below
+    new_moov_typed_data = _rewrite_and_build_moov_typed_data(
+        len(ftyp_typed_data), moov_children
+    )
     return io_utils.ChainedIO(
         [
-            io.BytesIO(source_ftyp_data),
-            io.BytesIO(_rewrite_moov(len(source_ftyp_data), moov_children)),
-            io.BytesIO(_build_mdat_header_bytes(mdat_body_size)),
-            *movie_sample_readers,
+            # ftyp
+            io.BytesIO(ftyp_typed_data),
+            # moov
+            io.BytesIO(new_moov_typed_data),
+            # mdat
+            io.BytesIO(_build_mdat_header_data(mdat_body_size)),
             *sample_readers,
         ]
     )
 
 
-def _rewrite_moov(moov_offset: int, moov_boxes: T.Sequence[BoxDict]) -> bytes:
+def _rewrite_and_build_moov_typed_data(
+    moov_offset: int, moov_children: T.Sequence[BoxDict]
+) -> bytes:
     # build moov for calculating moov size
     sample_offset = 0
-    for box in _filter_trak_boxes(moov_boxes):
-        sample_offset = _update_sbtl(box, sample_offset)
-    moov_data = _build_moov_bytes(moov_boxes)
-    moov_data_size = len(moov_data)
+    for box in _filter_trak_boxes(moov_children):
+        sample_offset = _update_sbtl_sample_offsets(box, sample_offset)
+    moov_typed_data = _build_moov_typed_data(moov_children)
+    moov_typed_data_size = len(moov_typed_data)
 
     # mdat header size
-    mdat_body_size = sum(sample.size for sample in iterate_samples(moov_boxes))
-    mdat_header = _build_mdat_header_bytes(mdat_body_size)
+    mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children))
+    mdat_header_data = _build_mdat_header_data(mdat_body_size)
 
     # build moov for real
-    sample_offset = moov_offset + len(moov_data) + len(mdat_header)
-    for box in _filter_trak_boxes(moov_boxes):
-        sample_offset = _update_sbtl(box, sample_offset)
-    moov_data = _build_moov_bytes(moov_boxes)
-    assert len(moov_data) == moov_data_size, f"{len(moov_data)} != {moov_data_size}"
-
-    return moov_data
+    sample_offset = moov_offset + len(moov_typed_data) + len(mdat_header_data)
+    for box in _filter_trak_boxes(moov_children):
+        sample_offset = _update_sbtl_sample_offsets(box, sample_offset)
+    moov_typed_data = _build_moov_typed_data(moov_children)
+    assert (
+        len(moov_typed_data) == moov_typed_data_size
+    ), f"{len(moov_typed_data)} != {moov_typed_data_size}"
+
+    return moov_typed_data
diff --git a/mapillary_tools/mp4/__init__.py b/mapillary_tools/mp4/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mapillary_tools/geotag/construct_mp4_parser.py b/mapillary_tools/mp4/construct_mp4_parser.py
similarity index 94%
rename from mapillary_tools/geotag/construct_mp4_parser.py
rename to mapillary_tools/mp4/construct_mp4_parser.py
index f8b47c06..8ca1454b 100644
--- a/mapillary_tools/geotag/construct_mp4_parser.py
+++ b/mapillary_tools/mp4/construct_mp4_parser.py
@@ -441,12 +441,6 @@ def parse_box(self, data: bytes) -> BoxDict:
     def parse_boxlist(self, data: bytes) -> T.List[BoxDict]:
         return T.cast(T.List[BoxDict], self.BoxList.parse(data))
 
-    def build_box(self, box: BoxDict) -> bytes:
-        return self.Box.build(box)
-
-    def build_boxlist(self, boxes: T.Sequence[BoxDict]) -> bytes:
-        return self.BoxList.build(boxes)
-
 
 class Box32ConstructBuilder(Box64ConstructBuilder):
     """
@@ -467,6 +461,18 @@ def Box(self) -> C.Construct:
 
         return self._box
 
+    def parse_box(self, data: bytes) -> BoxDict:
+        raise NotImplementedError("Box32ConstructBuilder does not support parsing")
+
+    def parse_boxlist(self, data: bytes) -> T.List[BoxDict]:
+        raise NotImplementedError("Box32ConstructBuilder does not support parsing")
+
+    def build_box(self, box: BoxDict) -> bytes:
+        return self.Box.build(box)
+
+    def build_boxlist(self, boxes: T.Sequence[BoxDict]) -> bytes:
+        return self.BoxList.build(boxes)
+
 
 # pyre-ignore[9]: pyre does not support recursive type SwitchMapType
 CMAP: SwitchMapType = {
@@ -580,8 +586,17 @@ def _new_cmap_without_boxes(
 def find_box_at_pathx(
     box: T.Union[T.Sequence[BoxDict], BoxDict], path: T.Sequence[bytes]
 ) -> BoxDict:
-    if not path:
+    found = find_box_at_path(box, path)
+    if found is None:
         raise ValueError(f"box at path {path} not found")
+    return found
+
+
+def find_box_at_path(
+    box: T.Union[T.Sequence[BoxDict], BoxDict], path: T.Sequence[bytes]
+) -> T.Optional[BoxDict]:
+    if not path:
+        return None
 
     boxes: T.Sequence[BoxDict]
     if isinstance(box, dict):
@@ -593,12 +608,13 @@ def find_box_at_pathx(
         if box["type"] == path[0]:
             if len(path) == 1:
                 return box
-            else:
-                box_data = T.cast(T.Sequence[BoxDict], box["data"])
-                # ListContainer from construct is not sequence
-                assert isinstance(
-                    box_data, T.Sequence
-                ), f"expect a list of boxes but got {type(box_data)} at path {path}"
-                return find_box_at_pathx(box_data, path[1:])
-
-    raise ValueError(f"box at path {path} not found")
+            box_data = T.cast(T.Sequence[BoxDict], box["data"])
+            # ListContainer from construct is not sequence
+            assert isinstance(
+                box_data, T.Sequence
+            ), f"expect a list of boxes but got {type(box_data)} at path {path}"
+            found = find_box_at_path(box_data, path[1:])
+            if found is not None:
+                return found
+
+    return None
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/mp4/mp4_sample_parser.py
similarity index 50%
rename from mapillary_tools/geotag/mp4_sample_parser.py
rename to mapillary_tools/mp4/mp4_sample_parser.py
index 4c90e0a3..1cebd682 100644
--- a/mapillary_tools/geotag/mp4_sample_parser.py
+++ b/mapillary_tools/mp4/mp4_sample_parser.py
@@ -1,55 +1,46 @@
 import datetime
-import io
 import typing as T
 from pathlib import Path
 
-from . import construct_mp4_parser as cparser, simple_mp4_parser as parser
+from . import construct_mp4_parser as cparser, simple_mp4_parser as sparser
 
 
 class RawSample(T.NamedTuple):
     # 1-based index
     description_idx: int
-    # sample offset
+
+    # sample offset (offset from the beginning of the file)
     offset: int
-    # sample size
+
+    # sample size (in bytes)
     size: int
-    # sample_delta read from stts entries,
+
+    # sample_delta read from stts entries that decides when to decode the sample,
     # i.e. STTS(n) in the forumula DT(n+1) = DT(n) + STTS(n)
+    # NOTE: timescale is not applied yet (hence int)
     timedelta: int
-    # sample composition offset,
+
+    # sample composition offset that decides when to present the sample,
     # i.e. CTTS(n) in the forumula CT(n) = DT(n) + CTTS(n).
+    # NOTE: timescale is not applied yet (hence int)
     composition_offset: int
+
     # if it is a sync sample
     is_sync: bool
 
 
-# TODO: can not inherit RawSample?
 class Sample(T.NamedTuple):
-    # copied from RawSample
+    raw_sample: RawSample
 
-    # 1-based index
-    description_idx: int
-    # sample offset
-    offset: int
-    # sample size
-    size: int
-    # sample delta in seconds read from stts entries,
-    # i.e. (STTS(n) / timescale) in the forumula DT(n+1) = DT(n) + STTS(n)
-    timedelta: float
-    # sample composition offset in seconds,
-    # i.e. (CTTS(n) / timescale) in the forumula CT(n) = DT(n) + CTTS(n).
-    composition_offset: float
-    # if it is a sync sample
-    is_sync: bool
+    # accumulated timedelta in seconds, i.e. DT(n) / timescale
+    exact_time: float
+
+    # accumulated composition timedelta in seconds, i.e. CT(n) / timescale
+    exact_composition_time: float
 
-    # extended fields below
+    # exact timedelta in seconds, i.e. STTS(n) / timescale
+    exact_timedelta: float
 
-    # accumulated sample_delta in seconds,
-    # i.e. (DT(n) / timescale) in the forumula DT(n+1) = DT(n) + STTS(n)
-    time_offset: T.Union[int, float]
-    # accumulated composition offset in seconds,
-    # i.e. (CT(n) / timescale) in the forumula CT(n) = DT(n) + CTTS(n).
-    composition_time_offset: T.Union[int, float]
     # reference to the sample description
     description: T.Dict
 
@@ -138,109 +129,28 @@ def _extract_raw_samples(
 def _extract_samples(
     raw_samples: T.Iterator[RawSample],
     descriptions: T.List,
+    timescale: int,
 ) -> T.Generator[Sample, None, None]:
     acc_delta = 0
     for raw_sample in raw_samples:
         yield Sample(
-            description_idx=raw_sample.description_idx,
-            offset=raw_sample.offset,
-            size=raw_sample.size,
-            timedelta=raw_sample.timedelta,
-            composition_offset=raw_sample.composition_offset,
-            is_sync=raw_sample.is_sync,
+            raw_sample=raw_sample,
             description=descriptions[raw_sample.description_idx - 1],
-            time_offset=acc_delta,
+            exact_time=acc_delta / timescale,
+            exact_timedelta=raw_sample.timedelta / timescale,
             # CT(n) = DT(n) + CTTS(n)
-            composition_time_offset=(acc_delta + raw_sample.composition_offset),
+            exact_composition_time=(acc_delta + raw_sample.composition_offset)
+            / timescale,
         )
         acc_delta += raw_sample.timedelta
 
 
-def _apply_timescale(sample: Sample, media_timescale: int) -> Sample:
-    return Sample(
-        description_idx=sample.description_idx,
-        offset=sample.offset,
-        size=sample.size,
-        timedelta=sample.timedelta / media_timescale,
-        composition_offset=sample.composition_offset / media_timescale,
-        is_sync=sample.is_sync,
-        description=sample.description,
-        time_offset=sample.time_offset / media_timescale,
-        composition_time_offset=sample.composition_time_offset / media_timescale,
-    )
-
-
-def parse_raw_samples_from_stbl(
-    stbl: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]:
-    """
-    DEPRECATED: use parse_raw_samples_from_stbl_bytes instead
-    """
-
-    descriptions = []
-    sizes = []
-    chunk_offsets = []
-    chunk_entries = []
-    timedeltas: T.List[int] = []
-    composition_offsets: T.Optional[T.List[int]] = None
-    syncs: T.Optional[T.Set[int]] = None
-
-    for h, s in parser.parse_boxes(stbl, maxsize=maxsize, extend_eof=False):
-        if h.type == b"stsd":
-            box = cparser.SampleDescriptionBox.parse(s.read(h.maxsize))
-            descriptions = list(box.entries)
-        elif h.type == b"stsz":
-            box = cparser.SampleSizeBox.parse(s.read(h.maxsize))
-            if box.sample_size == 0:
-                sizes = list(box.entries)
-            else:
-                sizes = [box.sample_size for _ in range(box.sample_count)]
-        elif h.type == b"stco":
-            box = cparser.ChunkOffsetBox.parse(s.read(h.maxsize))
-            chunk_offsets = list(box.entries)
-        elif h.type == b"co64":
-            box = cparser.ChunkLargeOffsetBox.parse(s.read(h.maxsize))
-            chunk_offsets = list(box.entries)
-        elif h.type == b"stsc":
-            box = cparser.SampleToChunkBox.parse(s.read(h.maxsize))
-            chunk_entries = list(box.entries)
-        elif h.type == b"stts":
-            timedeltas = []
-            box = cparser.TimeToSampleBox.parse(s.read(h.maxsize))
-            for entry in box.entries:
-                for _ in range(entry.sample_count):
-                    timedeltas.append(entry.sample_delta)
-        elif h.type == b"ctts":
-            composition_offsets = []
-            box = cparser.CompositionTimeToSampleBox.parse(s.read(h.maxsize))
-            for entry in box.entries:
-                for _ in range(entry.sample_count):
-                    composition_offsets.append(entry.sample_offset)
-        elif h.type == b"stss":
-            box = cparser.SyncSampleBox.parse(s.read(h.maxsize))
-            syncs = set(box.entries)
-
-    # some stbl have less timedeltas than the sample count i.e. len(sizes),
-    # in this case append 0's to timedeltas
-    while len(timedeltas) < len(sizes):
-        timedeltas.append(0)
-    if composition_offsets is not None:
-        while len(composition_offsets) < len(sizes):
-            composition_offsets.append(0)
-
-    raw_samples = _extract_raw_samples(
-        sizes, chunk_entries, chunk_offsets, timedeltas, composition_offsets, syncs
-    )
-    return descriptions, raw_samples
-
-
 STBLBoxlistConstruct = cparser.Box64ConstructBuilder(
     T.cast(cparser.SwitchMapType, cparser.CMAP[b"stbl"])
 ).BoxList
 
 
-def parse_raw_samples_from_stbl_bytes(
+def extract_raw_samples_from_stbl_data(
     stbl: bytes,
 ) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]:
     descriptions = []
@@ -251,9 +161,11 @@ def parse_raw_samples_from_stbl_bytes(
     composition_offsets: T.Optional[T.List[int]] = None
     syncs: T.Optional[T.Set[int]] = None
 
-    stbl_boxes = T.cast(T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl))
+    stbl_children = T.cast(
+        T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl)
+    )
 
-    for box in stbl_boxes:
+    for box in stbl_children:
         data: T.Dict = T.cast(T.Dict, box["data"])
 
         if box["type"] == b"stsd":
@@ -296,124 +208,108 @@ def parse_raw_samples_from_stbl_bytes(
     return descriptions, raw_samples
 
 
-def parse_descriptions_from_trak(trak: T.BinaryIO, maxsize: int = -1) -> T.List[T.Dict]:
-    data = parser.parse_box_data_first(
-        trak, [b"mdia", b"minf", b"stbl", b"stsd"], maxsize=maxsize
-    )
-    if data is None:
-        return []
-    box = cparser.SampleDescriptionBox.parse(data)
-    return list(box.entries)
-
-
-def parse_samples_from_trak(
-    trak: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[Sample, None, None]:
-    trak_start_offset = trak.tell()
-
-    trak.seek(trak_start_offset, io.SEEK_SET)
-    mdhd_box = parser.parse_box_data_firstx(trak, [b"mdia", b"mdhd"], maxsize=maxsize)
-    mdhd = T.cast(T.Dict, cparser.MediaHeaderBox.parse(mdhd_box))
-
-    trak.seek(trak_start_offset, io.SEEK_SET)
-    h, s = parser.parse_box_path_firstx(
-        trak, [b"mdia", b"minf", b"stbl"], maxsize=maxsize
-    )
-    descriptions, raw_samples = parse_raw_samples_from_stbl(s, maxsize=h.maxsize)
-
-    yield from (
-        _apply_timescale(s, mdhd["timescale"])
-        for s in _extract_samples(raw_samples, descriptions)
-    )
-
-
-STSDBoxListConstruct = cparser.Box64ConstructBuilder(
+_STSDBoxListConstruct = cparser.Box64ConstructBuilder(
     # pyre-ignore[6]: pyre does not support recursive type SwitchMapType
     {b"stsd": cparser.CMAP[b"stsd"]}
 ).BoxList
 
 
 class TrackBoxParser:
-    trak_boxes: T.Sequence[cparser.BoxDict]
+    trak_children: T.Sequence[cparser.BoxDict]
     stbl_data: bytes
 
-    def __init__(self, trak_boxes: T.Sequence[cparser.BoxDict]):
-        self.trak_boxes = trak_boxes
-        stbl = cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"minf", b"stbl"])
+    def __init__(self, trak_children: T.Sequence[cparser.BoxDict]):
+        self.trak_children = trak_children
+        stbl = cparser.find_box_at_pathx(
+            self.trak_children, [b"mdia", b"minf", b"stbl"]
+        )
         self.stbl_data = T.cast(bytes, stbl["data"])
 
-    def tkhd(self) -> T.Dict:
+    def extract_tkhd_boxdata(self) -> T.Dict:
         return T.cast(
-            T.Dict, cparser.find_box_at_pathx(self.trak_boxes, [b"tkhd"])["data"]
+            T.Dict, cparser.find_box_at_pathx(self.trak_children, [b"tkhd"])["data"]
         )
 
     def is_video_track(self) -> bool:
-        hdlr = cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"hdlr"])
+        hdlr = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"hdlr"])
         return T.cast(T.Dict[str, T.Any], hdlr["data"])["handler_type"] == b"vide"
 
-    def parse_sample_description(self) -> T.Dict:
-        boxes = STSDBoxListConstruct.parse(self.stbl_data)
+    def extract_sample_descriptions(self) -> T.List[T.Dict]:
+        # TODO: return [] if parsing fails
+        boxes = _STSDBoxListConstruct.parse(self.stbl_data)
         stsd = cparser.find_box_at_pathx(
             T.cast(T.Sequence[cparser.BoxDict], boxes), [b"stsd"]
         )
-        return T.cast(T.Dict, stsd["data"])
+        return T.cast(T.List[T.Dict], T.cast(T.Dict, stsd["data"])["entries"])
+
+    def extract_elst_boxdata(self) -> T.Optional[T.Dict]:
+        box = cparser.find_box_at_path(self.trak_children, [b"edts", b"elst"])
+        if box is None:
+            return None
+        return T.cast(T.Dict, box["data"])
+
+    def extract_mdhd_boxdata(self) -> T.Dict:
+        box = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])
+        return T.cast(T.Dict, box["data"])
 
-    def parse_raw_samples(self) -> T.Generator[RawSample, None, None]:
-        _, raw_samples = parse_raw_samples_from_stbl_bytes(self.stbl_data)
+    def extract_raw_samples(self) -> T.Generator[RawSample, None, None]:
+        _, raw_samples = extract_raw_samples_from_stbl_data(self.stbl_data)
         yield from raw_samples
 
-    def parse_samples(self) -> T.Generator[Sample, None, None]:
-        descriptions, raw_samples = parse_raw_samples_from_stbl_bytes(self.stbl_data)
+    def extract_samples(self) -> T.Generator[Sample, None, None]:
+        descriptions, raw_samples = extract_raw_samples_from_stbl_data(self.stbl_data)
         mdhd = T.cast(
             T.Dict,
-            cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"mdhd"])["data"],
-        )
-        yield from (
-            _apply_timescale(s, mdhd["timescale"])
-            for s in _extract_samples(raw_samples, descriptions)
+            cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])["data"],
         )
+        yield from _extract_samples(raw_samples, descriptions, mdhd["timescale"])
 
 
 class MovieBoxParser:
-    moov_boxes: T.Sequence[cparser.BoxDict]
+    moov_children: T.Sequence[cparser.BoxDict]
 
-    def __init__(self, moov: bytes):
-        self.moov_boxes = T.cast(
+    def __init__(self, moov_data: bytes):
+        self.moov_children = T.cast(
             T.Sequence[cparser.BoxDict],
-            cparser.MOOVWithoutSTBLBuilderConstruct.BoxList.parse(moov),
+            cparser.MOOVWithoutSTBLBuilderConstruct.BoxList.parse(moov_data),
         )
 
     @classmethod
     def parse_file(cls, video_path: Path) -> "MovieBoxParser":
         with video_path.open("rb") as fp:
-            moov = parser.parse_box_data_firstx(fp, [b"moov"])
+            moov = sparser.parse_box_data_firstx(fp, [b"moov"])
+        return MovieBoxParser(moov)
+
+    @classmethod
+    def parse_stream(cls, stream: T.BinaryIO) -> "MovieBoxParser":
+        moov = sparser.parse_box_data_firstx(stream, [b"moov"])
         return MovieBoxParser(moov)
 
-    def mvhd(self):
-        mvhd = cparser.find_box_at_pathx(self.moov_boxes, [b"mvhd"])
-        return mvhd["data"]
+    def extract_mvhd_boxdata(self) -> T.Dict:
+        mvhd = cparser.find_box_at_pathx(self.moov_children, [b"mvhd"])
+        return T.cast(T.Dict, mvhd["data"])
 
-    def parse_tracks(self) -> T.Generator[TrackBoxParser, None, None]:
-        for box in self.moov_boxes:
+    def extract_tracks(self) -> T.Generator[TrackBoxParser, None, None]:
+        for box in self.moov_children:
             if box["type"] == b"trak":
                 yield TrackBoxParser(T.cast(T.Sequence[cparser.BoxDict], box["data"]))
 
-    def parse_track_at(self, stream_idx: int) -> TrackBoxParser:
+    def extract_track_at(self, stream_idx: int) -> TrackBoxParser:
         """
         stream_idx should be the stream_index specifier. See http://ffmpeg.org/ffmpeg.html#Stream-specifiers-1
         > Stream numbering is based on the order of the streams as detected by libavformat
         """
-        trak_boxes = [box for box in self.moov_boxes if box["type"] == b"trak"]
+        trak_boxes = [box for box in self.moov_children if box["type"] == b"trak"]
         if not (0 <= stream_idx < len(trak_boxes)):
             raise IndexError(
                 "unable to read stream at %d from the track list (length %d)",
                 stream_idx,
                 len(trak_boxes),
             )
-        return TrackBoxParser(
-            T.cast(T.Sequence[cparser.BoxDict], trak_boxes[stream_idx]["data"])
+        trak_children = T.cast(
+            T.Sequence[cparser.BoxDict], trak_boxes[stream_idx]["data"]
         )
+        return TrackBoxParser(trak_children)
 
 
 _DT_1904 = datetime.datetime.utcfromtimestamp(0).replace(year=1904)
diff --git a/mapillary_tools/geotag/simple_mp4_parser.py b/mapillary_tools/mp4/simple_mp4_parser.py
similarity index 100%
rename from mapillary_tools/geotag/simple_mp4_parser.py
rename to mapillary_tools/mp4/simple_mp4_parser.py
diff --git a/mapillary_tools/sample_video.py b/mapillary_tools/sample_video.py
index e70a370b..65d1baa7 100644
--- a/mapillary_tools/sample_video.py
+++ b/mapillary_tools/sample_video.py
@@ -9,7 +9,8 @@
 
 from . import constants, exceptions, ffmpeg as ffmpeglib, geo, types, utils
 from .exif_write import ExifEdit
-from .geotag import geotag_videos_from_video, mp4_sample_parser
+from .geotag import geotag_videos_from_video
+from .mp4 import mp4_sample_parser
 from .process_geotag_properties import GeotagSource
 
 LOG = logging.getLogger(__name__)
@@ -234,10 +235,10 @@ def _sample_video_stream_by_distance(
     """
 
     LOG.info("Extracting video samples")
-    sorted_samples = list(video_track_parser.parse_samples())
+    sorted_samples = list(video_track_parser.extract_samples())
     # we need sort sampels by composition time (CT) not the decoding offset (DT)
     # CT is the oder of videos streaming to audiences, as well as the order ffmpeg sampling
-    sorted_samples.sort(key=lambda sample: sample.composition_time_offset)
+    sorted_samples.sort(key=lambda sample: sample.exact_composition_time)
     LOG.info("Found total %d video samples", len(sorted_samples))
 
     # interpolate sample points between the GPS track range (with 1ms buffer)
@@ -251,11 +252,11 @@ def _sample_video_stream_by_distance(
         (
             frame_idx_0based,
             video_sample,
-            interpolator.interpolate(video_sample.composition_time_offset),
+            interpolator.interpolate(video_sample.exact_composition_time),
         )
         for frame_idx_0based, video_sample in enumerate(sorted_samples)
         if _within_track_time_range_buffered(
-            points, video_sample.composition_time_offset
+            points, video_sample.exact_composition_time
         )
     ]
     LOG.info("Found total %d interpolated video samples", len(interp_sample_points))
@@ -316,7 +317,7 @@ def _sample_single_video_by_distance(
     LOG.info("Extracting video samples")
     video_stream_idx = video_stream["index"]
     moov_parser = mp4_sample_parser.MovieBoxParser.parse_file(video_path)
-    video_track_parser = moov_parser.parse_track_at(video_stream_idx)
+    video_track_parser = moov_parser.extract_track_at(video_stream_idx)
     sample_points_by_frame_idx = _sample_video_stream_by_distance(
         video_metadata.points, video_track_parser, sample_distance
     )
@@ -352,8 +353,8 @@ def _sample_single_video_by_distance(
 
             video_sample, interp = sample_points_by_frame_idx[sample_idx]
             assert (
-                interp.time == video_sample.composition_time_offset
-            ), f"interpolated time {interp.time} should match the video sample time {video_sample.composition_time_offset}"
+                interp.time == video_sample.exact_composition_time
+            ), f"interpolated time {interp.time} should match the video sample time {video_sample.exact_composition_time}"
 
             timestamp = start_time + datetime.timedelta(seconds=interp.time)
             exif_edit = ExifEdit(sample_paths[0])
diff --git a/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py b/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py
index 7f088677..9aef060f 100644
--- a/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py
+++ b/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py
@@ -1,8 +1,9 @@
 import typing as T
 
-from mapillary_tools import geo
-from mapillary_tools.geotag import blackvue_parser, simple_mp4_parser
-from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser
+from ... import geo
+from ...geotag import blackvue_parser
+from ...mp4 import simple_mp4_parser as sparser
+from .base_parser import BaseParser
 
 
 class BlackVueParser(BaseParser):
@@ -21,7 +22,7 @@ def extract_points(self) -> T.Sequence[geo.Point]:
                 points = blackvue_parser.extract_points(fp) or []
                 self.pointsFound = len(points) > 0
                 return points
-            except simple_mp4_parser.ParsingError:
+            except sparser.ParsingError:
                 return []
 
     def extract_make(self) -> T.Optional[str]:
diff --git a/mapillary_tools/video_data_extraction/extractors/camm_parser.py b/mapillary_tools/video_data_extraction/extractors/camm_parser.py
index 98e0b8d6..122a0ca5 100644
--- a/mapillary_tools/video_data_extraction/extractors/camm_parser.py
+++ b/mapillary_tools/video_data_extraction/extractors/camm_parser.py
@@ -1,9 +1,10 @@
 import functools
 import typing as T
 
-from mapillary_tools import geo
-from mapillary_tools.geotag import camm_parser, simple_mp4_parser
-from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser
+from ... import geo
+from ...geotag import camm_parser
+from ...mp4 import simple_mp4_parser as sparser
+from .base_parser import BaseParser
 
 
 class CammParser(BaseParser):
@@ -23,7 +24,7 @@ def extract_points(self) -> T.Sequence[geo.Point]:
         with source_path.open("rb") as fp:
             try:
                 return camm_parser.extract_points(fp) or []
-            except simple_mp4_parser.ParsingError:
+            except sparser.ParsingError:
                 return []
 
     def extract_make(self) -> T.Optional[str]:
diff --git a/mapillary_tools/video_data_extraction/extractors/gopro_parser.py b/mapillary_tools/video_data_extraction/extractors/gopro_parser.py
index 3a4c3efd..77e488ad 100644
--- a/mapillary_tools/video_data_extraction/extractors/gopro_parser.py
+++ b/mapillary_tools/video_data_extraction/extractors/gopro_parser.py
@@ -1,8 +1,9 @@
 import typing as T
 
-from mapillary_tools import geo
-from mapillary_tools.geotag import gpmf_parser, simple_mp4_parser
-from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser
+from ... import geo
+from ...geotag import gpmf_parser
+from ...mp4 import simple_mp4_parser as sparser
+from .base_parser import BaseParser
 
 
 class GoProParser(BaseParser):
@@ -21,7 +22,7 @@ def extract_points(self) -> T.Sequence[geo.Point]:
                 points = gpmf_parser.extract_points(fp) or []
                 self.pointsFound = len(points) > 0
                 return points
-            except simple_mp4_parser.ParsingError:
+            except sparser.ParsingError:
                 return []
 
     def extract_make(self) -> T.Optional[str]:
diff --git a/setup.py b/setup.py
index 74b9a348..2d09b295 100644
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,7 @@ def readme():
         "mapillary_tools",
         "mapillary_tools.commands",
         "mapillary_tools.geotag",
+        "mapillary_tools.mp4",
         "mapillary_tools.video_data_extraction",
         "mapillary_tools.video_data_extraction.extractors",
     ],
diff --git a/tests/cli/simple_mp4_parser.py b/tests/cli/simple_mp4_parser.py
index 3fd6ae52..da8e3f29 100644
--- a/tests/cli/simple_mp4_parser.py
+++ b/tests/cli/simple_mp4_parser.py
@@ -6,10 +6,10 @@
 import typing as T
 
 from mapillary_tools import utils
-from mapillary_tools.geotag import (
+from mapillary_tools.mp4 import (
     construct_mp4_parser as cparser,
     mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 LOG = logging.getLogger(__name__)
@@ -37,13 +37,15 @@ def _validate_samples(
     samples: T.List[sample_parser.RawSample] = []
 
     with open(path, "rb") as fp:
-        for h, s in parser.parse_path(
+        for h, s in sparser.parse_path(
             fp, [b"moov", b"trak", b"mdia", b"minf", b"stbl"]
         ):
             (
                 descriptions,
                 raw_samples,
-            ) = sample_parser.parse_raw_samples_from_stbl(s, maxsize=h.maxsize)
+            ) = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
+                s, maxsize=h.maxsize
+            )
             samples.extend(
                 sample
                 for sample in raw_samples
@@ -67,7 +69,7 @@ def _validate_samples(
 
 
 def _parse_structs(fp: T.BinaryIO):
-    for h, d, s in parser.parse_boxes_recursive(fp, box_list_types=box_list_types):
+    for h, d, s in sparser.parse_boxes_recursive(fp, box_list_types=box_list_types):
         margin = "\t" * d
         if h.size32 == 0:
             header = f"{str(h.type)} {h.box_size} (open-ended):"
@@ -86,7 +88,7 @@ def _parse_structs(fp: T.BinaryIO):
 
 
 def _dump_box_data_at(fp: T.BinaryIO, box_type_path: T.List[bytes]):
-    for h, s in parser.parse_path(fp, box_type_path):
+    for h, s in sparser.parse_path(fp, box_type_path):
         max_chunk_size = 1024
         read = 0
         while read < h.maxsize or h.maxsize == -1:
@@ -103,22 +105,26 @@ def _dump_box_data_at(fp: T.BinaryIO, box_type_path: T.List[bytes]):
 
 
 def _parse_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None):
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
         offset = s.tell()
-        for h1, s1 in parser.parse_path(s, [b"mdia", b"mdhd"], maxsize=h.maxsize):
+        for h1, s1 in sparser.parse_path(s, [b"mdia", b"mdhd"], maxsize=h.maxsize):
             box = cparser.MediaHeaderBox.parse(s1.read(h.maxsize))
             LOG.info(box)
             LOG.info(sample_parser.to_datetime(box.creation_time))
             LOG.info(box.duration / box.timescale)
         s.seek(offset, io.SEEK_SET)
-        for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize):
+        for sample in sample_parser.parse_samples_from_trak_DEPRECATED(
+            s, maxsize=h.maxsize
+        ):
             if filters is None or sample.description["format"] in filters:
                 print(sample)
 
 
 def _dump_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None):
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
-        for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
+        for sample in sample_parser.parse_samples_from_trak_DEPRECATED(
+            s, maxsize=h.maxsize
+        ):
             if filters is None or sample.description["format"] in filters:
                 fp.seek(sample.offset, io.SEEK_SET)
                 data = fp.read(sample.size)
@@ -203,13 +209,13 @@ def _process_path(parsed_args, path: pathlib.Path):
                     if box_path is None:
                         _parse_structs(fp)
                     else:
-                        data = parser.parse_mp4_data_firstx(fp, box_path)
+                        data = sparser.parse_mp4_data_firstx(fp, box_path)
                         _parse_structs(io.BytesIO(data))
                 elif parsed_args.full:
                     if box_path is None:
                         boxes = cparser.MP4ParserConstruct.BoxList.parse_stream(fp)
                     else:
-                        data = parser.parse_mp4_data_firstx(fp, box_path)
+                        data = sparser.parse_mp4_data_firstx(fp, box_path)
                         boxes = cparser.MP4ParserConstruct.BoxList.parse_stream(
                             io.BytesIO(data)
                         )
@@ -222,7 +228,7 @@ def _process_path(parsed_args, path: pathlib.Path):
                             )
                         )
                     else:
-                        data = parser.parse_mp4_data_firstx(fp, box_path)
+                        data = sparser.parse_mp4_data_firstx(fp, box_path)
                         boxes = (
                             cparser.MP4WithoutSTBLParserConstruct.BoxList.parse_stream(
                                 io.BytesIO(data)
diff --git a/tests/unit/test_blackvue_parser.py b/tests/unit/test_blackvue_parser.py
index 9ec65a45..0832a739 100644
--- a/tests/unit/test_blackvue_parser.py
+++ b/tests/unit/test_blackvue_parser.py
@@ -2,7 +2,8 @@
 
 import mapillary_tools.geo as geo
 
-from mapillary_tools.geotag import blackvue_parser, construct_mp4_parser as cparser
+from mapillary_tools.geotag import blackvue_parser
+from mapillary_tools.mp4 import construct_mp4_parser as cparser
 
 
 def test_parse_points():
diff --git a/tests/unit/test_camm_parser.py b/tests/unit/test_camm_parser.py
index ca22b571..ed323783 100644
--- a/tests/unit/test_camm_parser.py
+++ b/tests/unit/test_camm_parser.py
@@ -7,9 +7,9 @@
 from mapillary_tools.geotag import (
     camm_builder,
     camm_parser,
-    construct_mp4_parser as cparser,
     simple_mp4_builder,
 )
+from mapillary_tools.mp4 import construct_mp4_parser as cparser
 
 
 def test_filter_points_by_edit_list():
diff --git a/tests/unit/test_mp4_sample_parser.py b/tests/unit/test_mp4_sample_parser.py
index 003fae37..6e561fcb 100644
--- a/tests/unit/test_mp4_sample_parser.py
+++ b/tests/unit/test_mp4_sample_parser.py
@@ -1,19 +1,19 @@
 from pathlib import Path
 
-from mapillary_tools.geotag import mp4_sample_parser
+from mapillary_tools.mp4 import mp4_sample_parser
 
 
 def test_movie_box_parser():
     moov_parser = mp4_sample_parser.MovieBoxParser.parse_file(
         Path("tests/data/videos/sample-5s.mp4")
     )
-    assert 2 == len(list(moov_parser.parse_tracks()))
-    video_track = moov_parser.parse_track_at(0)
+    assert 2 == len(list(moov_parser.extract_tracks()))
+    video_track = moov_parser.extract_track_at(0)
     assert video_track.is_video_track()
-    aac_track = moov_parser.parse_track_at(1)
+    aac_track = moov_parser.extract_track_at(1)
     assert not aac_track.is_video_track()
-    samples = list(video_track.parse_samples())
-    raw_samples = list(video_track.parse_raw_samples())
+    samples = list(video_track.extract_samples())
+    raw_samples = list(video_track.extract_raw_samples())
     assert 171 == len(samples)
     assert len(samples) == len(raw_samples)
     assert {
@@ -31,7 +31,7 @@ def test_movie_box_parser():
         "height": 70778880,
     } == {
         k: v
-        for k, v in video_track.tkhd().items()
+        for k, v in video_track.extract_tkhd_boxdata().items()
         if k
         in [
             "version",
@@ -47,8 +47,8 @@ def test_movie_box_parser():
             "height",
         ]
     }
-    assert isinstance(video_track.tkhd(), dict)
+    assert isinstance(video_track.extract_tkhd_boxdata(), dict)
     for sample, raw_sample in zip(samples, raw_samples):
-        assert sample.offset == raw_sample.offset
-        assert sample.is_sync == raw_sample.is_sync
-        assert sample.size == raw_sample.size
+        assert sample.raw_sample.offset == raw_sample.offset
+        assert sample.raw_sample.is_sync == raw_sample.is_sync
+        assert sample.raw_sample.size == raw_sample.size
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index 88b00cad..8bd67e7d 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -2,10 +2,12 @@
 import typing as T
 
 from mapillary_tools.geotag import (
+    simple_mp4_builder as builder,
+)
+from mapillary_tools.mp4 import (
     construct_mp4_parser as cparser,
     mp4_sample_parser as sample_parser,
-    simple_mp4_builder as builder,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 
@@ -44,9 +46,9 @@ def _build_and_parse_stbl(
     d = cparser.Box32ConstructBuilder({b"stbl": cparser.CMAP[b"stbl"]}).Box.build(
         {"type": b"stbl", "data": s}
     )
-    ss = parser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"])
+    ss = sparser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"])
     assert d[8:] == ss
-    _, parsed_samples = sample_parser.parse_raw_samples_from_stbl(io.BytesIO(ss))
+    _, parsed_samples = sample_parser.extract_raw_samples_from_stbl_data(ss)
     assert expected_samples == list(parsed_samples)
 
 
@@ -247,80 +249,82 @@ def test_parse_raw_samples_from_stbl():
             },
         ]
     )
-    descs, sample_iter = sample_parser.parse_raw_samples_from_stbl(
-        io.BytesIO(stbl_bytes)
-    )
-    samples = list(sample_iter)
-    assert [
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=1,
-            size=1,
-            timedelta=20,
-            composition_offset=0,
-            is_sync=True,
-        ),
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=2,
-            size=2,
-            timedelta=30,
-            composition_offset=0,
-            is_sync=False,
-        ),
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=5,
-            size=3,
-            timedelta=30,
-            composition_offset=0,
-            is_sync=True,
-        ),
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=8,
-            size=3,
-            timedelta=50,
-            composition_offset=0,
-            is_sync=False,
-        ),
-    ] == samples
-    d = builder.build_stbl_from_raw_samples(descs, samples)
-    assert d[1:] == [
-        {
-            "data": {
-                "entries": [
-                    {"sample_count": 1, "sample_delta": 20},
-                    {"sample_count": 2, "sample_delta": 30},
-                    {"sample_count": 1, "sample_delta": 50},
-                ]
+
+    def _verify_samples(descs, samples):
+        assert [
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=1,
+                size=1,
+                timedelta=20,
+                composition_offset=0,
+                is_sync=True,
+            ),
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=2,
+                size=2,
+                timedelta=30,
+                composition_offset=0,
+                is_sync=False,
+            ),
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=5,
+                size=3,
+                timedelta=30,
+                composition_offset=0,
+                is_sync=True,
+            ),
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=8,
+                size=3,
+                timedelta=50,
+                composition_offset=0,
+                is_sync=False,
+            ),
+        ] == samples
+        d = builder.build_stbl_from_raw_samples(descs, samples)
+        assert d[1:] == [
+            {
+                "data": {
+                    "entries": [
+                        {"sample_count": 1, "sample_delta": 20},
+                        {"sample_count": 2, "sample_delta": 30},
+                        {"sample_count": 1, "sample_delta": 50},
+                    ]
+                },
+                "type": b"stts",
             },
-            "type": b"stts",
-        },
-        {
-            "data": {
-                "entries": [
-                    {
-                        "first_chunk": 1,
-                        "sample_description_index": 1,
-                        "samples_per_chunk": 2,
-                    },
-                    {
-                        "first_chunk": 2,
-                        "sample_description_index": 1,
-                        "samples_per_chunk": 2,
-                    },
-                ]
+            {
+                "data": {
+                    "entries": [
+                        {
+                            "first_chunk": 1,
+                            "sample_description_index": 1,
+                            "samples_per_chunk": 2,
+                        },
+                        {
+                            "first_chunk": 2,
+                            "sample_description_index": 1,
+                            "samples_per_chunk": 2,
+                        },
+                    ]
+                },
+                "type": b"stsc",
             },
-            "type": b"stsc",
-        },
-        {
-            "data": {"entries": [1, 2, 3, 3], "sample_count": 4, "sample_size": 0},
-            "type": b"stsz",
-        },
-        {"data": {"entries": [1, 5]}, "type": b"co64"},
-        {"data": {"entries": [1, 3]}, "type": b"stss"},
-    ]
+            {
+                "data": {"entries": [1, 2, 3, 3], "sample_count": 4, "sample_size": 0},
+                "type": b"stsz",
+            },
+            {"data": {"entries": [1, 5]}, "type": b"co64"},
+            {"data": {"entries": [1, 3]}, "type": b"stss"},
+        ]
+
+    descs, sample_iter = sample_parser.extract_raw_samples_from_stbl_data(stbl_bytes)
+    samples = list(sample_iter)
+    _verify_samples(descs, samples)
 
 
 def test_box_header_0_building():
diff --git a/tests/unit/test_simple_mp4_parser.py b/tests/unit/test_simple_mp4_parser.py
index 19701398..5b375842 100644
--- a/tests/unit/test_simple_mp4_parser.py
+++ b/tests/unit/test_simple_mp4_parser.py
@@ -1,9 +1,9 @@
 import io
 import typing
 
-from mapillary_tools.geotag import (
+from mapillary_tools.mp4 import (
     construct_mp4_parser as cparser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 
@@ -26,7 +26,7 @@ def _parse(data: bytes):
     }
     consumed_size = 0
     ret = []
-    for h, _d, s in parser.parse_boxes_recursive(
+    for h, _d, s in sparser.parse_boxes_recursive(
         io.BytesIO(data), box_list_types=box_list_types
     ):
         box_data = s.read(h.maxsize)
@@ -42,7 +42,7 @@ def _parse(data: bytes):
 
 def _assert_box_type(
     data: bytes,
-    parsed: typing.List[typing.Tuple[parser.Header, bytes]],
+    parsed: typing.List[typing.Tuple[sparser.Header, bytes]],
     box_type: bytes,
 ):
     assert 1 == len(parsed)
@@ -55,7 +55,7 @@ def _assert_box_type(
 
 def test_parse_box_header():
     s = io.BytesIO(b"hello")
-    header = parser.parse_box_header(s, maxsize=0)
+    header = sparser.parse_box_header(s, maxsize=0)
     assert header.header_size == 0
     assert header.box_size == 0
     assert header.type == b""