src/remux/mp4-remuxer.ts

import AAC from './aac-helper';
import MP4 from './mp4-generator';
import type { HlsEventEmitter } from '../events';
import { Events } from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';
import { logger } from '../utils/logger';
import {
  InitSegmentData,
  Remuxer,
  RemuxerResult,
  RemuxedMetadata,
  RemuxedTrack,
  RemuxedUserdata,
} from '../types/remuxer';
import { PlaylistLevelType } from '../types/loader';
import { toMsFromMpegTsClock } from '../utils/timescale-conversion';
import type {
  AudioSample,
  AvcSample,
  DemuxedAudioTrack,
  DemuxedAvcTrack,
  DemuxedMetadataTrack,
  DemuxedUserdataTrack,
} from '../types/demuxer';
import type { TrackSet } from '../types/track';
import type { SourceBufferName } from '../types/buffer';
import type { Fragment } from '../loader/fragment';
import type { HlsConfig } from '../config';

const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;
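// Illustrative arithmetic (not from the source): one AAC frame carries 1024
// PCM samples, so at 44100 Hz it lasts 1024 / 44100 ≈ 23.2 ms; an MPEG audio
// frame (1152 samples) at 48000 Hz lasts exactly 24 ms. Using the sampling
// rate as the MP4 timescale (see generateIS below) makes every AAC frame
// exactly 1024 ticks long, avoiding fractional frame durations.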

let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;
let requiresPositiveDts: boolean = false;

export default class MP4Remuxer implements Remuxer {
  private observer: HlsEventEmitter;
  private config: HlsConfig;
  private typeSupported: any;
  private ISGenerated: boolean = false;
  private _initPTS!: number;
  private _initDTS!: number;
  private nextAvcDts: number | null = null;
  private nextAudioPts: number | null = null;
  private isAudioContiguous: boolean = false;
  private isVideoContiguous: boolean = false;

  constructor(
    observer: HlsEventEmitter,
    config: HlsConfig,
    typeSupported,
    vendor = ''
  ) {
    this.observer = observer;
    this.config = config;
    this.typeSupported = typeSupported;
    this.ISGenerated = false;

    if (chromeVersion === null) {
      const userAgent = navigator.userAgent || '';
      const result = userAgent.match(/Chrome\/(\d+)/i);
      chromeVersion = result ? parseInt(result[1]) : 0;
    }
    if (safariWebkitVersion === null) {
      const result = navigator.userAgent.match(/Safari\/(\d+)/i);
      safariWebkitVersion = result ? parseInt(result[1]) : 0;
    }
    requiresPositiveDts =
      (!!chromeVersion && chromeVersion < 75) ||
      (!!safariWebkitVersion && safariWebkitVersion < 600);
  }
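  // Illustrative example (UA string assumed): a userAgent containing
  // "Chrome/74.0.3729" parses to chromeVersion = 74, so requiresPositiveDts
  // becomes true and remuxVideo clamps the first DTS to 0 instead of emitting
  // negative timestamps, which these older engines appear not to tolerate.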

  destroy() {}

  resetTimeStamp(defaultTimeStamp) {
    logger.log('[mp4-remuxer]: initPTS & initDTS reset');
    this._initPTS = this._initDTS = defaultTimeStamp;
  }

  resetNextTimestamp() {
    logger.log('[mp4-remuxer]: reset next timestamp');
    this.isVideoContiguous = false;
    this.isAudioContiguous = false;
  }

  resetInitSegment() {
    logger.log('[mp4-remuxer]: ISGenerated flag reset');
    this.ISGenerated = false;
  }

  getVideoStartPts(videoSamples) {
    let rolloverDetected = false;
    const startPTS = videoSamples.reduce((minPTS, sample) => {
      const delta = sample.pts - minPTS;
      if (delta < -4294967296) {
        // 2^32; see normalizePts for the reasoning. We're hitting a PTS rollover here,
        // and we don't want it to impact the timeOffset calculation
        rolloverDetected = true;
        return normalizePts(minPTS, sample.pts);
      } else if (delta > 0) {
        return minPTS;
      } else {
        return sample.pts;
      }
    }, videoSamples[0].pts);
    if (rolloverDetected) {
      logger.debug('PTS rollover detected');
    }
    return startPTS;
  }
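  // Worked example (hypothetical values): given sample PTS values of
  // [8589933000, 1500], the second delta is 1500 - 8589933000 < -2^32, so the
  // 33-bit counter rolled over between the two samples.
  // normalizePts(8589933000, 1500) rewinds the earlier value by 2^33 to -1592,
  // so the returned start PTS is expressed on the post-rollover timeline.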

  remux(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    id3Track: DemuxedMetadataTrack,
    textTrack: DemuxedUserdataTrack,
    timeOffset: number,
    accurateTimeOffset: boolean,
    flush: boolean,
    playlistType: PlaylistLevelType
  ): RemuxerResult {
    let video: RemuxedTrack | undefined;
    let audio: RemuxedTrack | undefined;
    let initSegment: InitSegmentData | undefined;
    let text: RemuxedUserdata | undefined;
    let id3: RemuxedMetadata | undefined;
    let independent: boolean | undefined;
    let audioTimeOffset = timeOffset;
    let videoTimeOffset = timeOffset;

    // If we're remuxing audio and video progressively, wait until we've received enough samples for each track before proceeding.
    // This is done to synchronize the audio and video streams. We know the current segment will have samples if the "pid"
    // parameter is greater than -1. The pid is set when the PMT, which contains the track list, is parsed.
    // However, if the initSegment has already been generated, or we've reached the end of a segment (flush),
    // then we can remux one track without waiting for the other.
    const hasAudio = audioTrack.pid > -1;
    const hasVideo = videoTrack.pid > -1;
    const length = videoTrack.samples.length;
    const enoughAudioSamples = audioTrack.samples.length > 0;
    const enoughVideoSamples = length > 1;
    const canRemuxAvc =
      ((!hasAudio || enoughAudioSamples) &&
        (!hasVideo || enoughVideoSamples)) ||
      this.ISGenerated ||
      flush;

    if (canRemuxAvc) {
      if (!this.ISGenerated) {
        initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
      }

      const isVideoContiguous = this.isVideoContiguous;
      let firstKeyFrameIndex = -1;

      if (enoughVideoSamples) {
        firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
        if (!isVideoContiguous && this.config.forceKeyFrameOnDiscontinuity) {
          independent = true;
          if (firstKeyFrameIndex > 0) {
            logger.warn(
              `[mp4-remuxer]: Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`
            );
            const startPTS = this.getVideoStartPts(videoTrack.samples);
            videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
            videoTrack.dropped += firstKeyFrameIndex;
            videoTimeOffset +=
              (videoTrack.samples[0].pts - startPTS) /
              (videoTrack.timescale || 90000);
          } else if (firstKeyFrameIndex === -1) {
            logger.warn(
              `[mp4-remuxer]: No keyframe found out of ${length} video samples`
            );
            independent = false;
          }
        }
      }

      if (this.ISGenerated) {
        if (enoughAudioSamples && enoughVideoSamples) {
          // timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS).
          // If the first audio DTS is not aligned with the first video DTS, we need to take that into account
          // when providing timeOffset to remuxAudio / remuxVideo; otherwise there may be a small permanent
          // drift between the audio and video streams
          const startPTS = this.getVideoStartPts(videoTrack.samples);
          const tsDelta =
            normalizePts(audioTrack.samples[0].pts, startPTS) - startPTS;
          const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
          audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
          videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
        }
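        // Example of the adjustment above (assumed values): with a 90 kHz
        // input timescale, if the first audio PTS leads the first video PTS
        // by 4500 ticks, tsDelta is -4500, audioTimeOffset is unchanged, and
        // videoTimeOffset is pushed forward by 0.05 s, so neither track is
        // remuxed to start before its first real sample.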

        // Purposefully remuxing audio before video, so that remuxVideo can use nextAudioPts, which is calculated in remuxAudio.
        if (enoughAudioSamples) {
          // if initSegment was generated without audio samples, regenerate it again
          if (!audioTrack.samplerate) {
            logger.warn(
              '[mp4-remuxer]: regenerate InitSegment as audio detected'
            );
            initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
          }
          audio = this.remuxAudio(
            audioTrack,
            audioTimeOffset,
            this.isAudioContiguous,
            accurateTimeOffset,
            hasVideo ||
              enoughVideoSamples ||
              playlistType === PlaylistLevelType.AUDIO
              ? videoTimeOffset
              : undefined
          );
          if (enoughVideoSamples) {
            const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
            // if initSegment was generated without video samples, regenerate it again
            if (!videoTrack.inputTimeScale) {
              logger.warn(
                '[mp4-remuxer]: regenerate InitSegment as video detected'
              );
              initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
            }
            video = this.remuxVideo(
              videoTrack,
              videoTimeOffset,
              isVideoContiguous,
              audioTrackLength
            );
          }
        } else if (enoughVideoSamples) {
          video = this.remuxVideo(
            videoTrack,
            videoTimeOffset,
            isVideoContiguous,
            0
          );
        }
        if (video) {
          video.firstKeyFrame = firstKeyFrameIndex;
          video.independent = firstKeyFrameIndex !== -1;
        }
      }
    }

    // Allow ID3 and text to remux, even if more audio/video samples are required
    if (this.ISGenerated) {
      if (id3Track.samples.length) {
        id3 = flushTextTrackMetadataCueSamples(
          id3Track,
          timeOffset,
          this._initPTS,
          this._initDTS
        );
      }

      if (textTrack.samples.length) {
        text = flushTextTrackUserdataCueSamples(
          textTrack,
          timeOffset,
          this._initPTS
        );
      }
    }

    return {
      audio,
      video,
      initSegment,
      independent,
      text,
      id3,
    };
  }
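  // Illustrative sketch (not part of this file; caller and variable names are
  // hypothetical): a transmuxer would typically drive remux() once per parsed
  // fragment, e.g.
  //
  //   const result = remuxer.remux(
  //     audioTrack, videoTrack, id3Track, textTrack,
  //     frag.start, accurateTimeOffset, false, PlaylistLevelType.MAIN
  //   );
  //   // result.initSegment?.tracks carries codec/init data for SourceBuffer
  //   // setup; result.audio / result.video each pair a moof (data1) with an
  //   // mdat (data2) ready to append.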
  273.  
  generateIS(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    timeOffset
  ): InitSegmentData | undefined {
    const audioSamples = audioTrack.samples;
    const videoSamples = videoTrack.samples;
    const typeSupported = this.typeSupported;
    const tracks: TrackSet = {};
    const computePTSDTS = !Number.isFinite(this._initPTS);
    let container = 'audio/mp4';
    let initPTS: number | undefined;
    let initDTS: number | undefined;
    let timescale: number | undefined;

    if (computePTSDTS) {
      initPTS = initDTS = Infinity;
    }

    if (audioTrack.config && audioSamples.length) {
      // let's use audio sampling rate as MP4 time scale.
      // rationale is that there is an integer number of audio samples per frame (1024 for AAC),
      // so using the audio sampling rate here yields an integer MP4 frame duration,
      // which avoids potential rounding issues and A/V sync drift
      audioTrack.timescale = audioTrack.samplerate;
      if (!audioTrack.isAAC) {
        if (typeSupported.mpeg) {
          // Chrome and Safari
          container = 'audio/mpeg';
          audioTrack.codec = '';
        } else if (typeSupported.mp3) {
          // Firefox
          audioTrack.codec = 'mp3';
        }
      }
      tracks.audio = {
        id: 'audio',
        container: container,
        codec: audioTrack.codec,
        initSegment:
          !audioTrack.isAAC && typeSupported.mpeg
            ? new Uint8Array(0)
            : MP4.initSegment([audioTrack]),
        metadata: {
          channelCount: audioTrack.channelCount,
        },
      };
      if (computePTSDTS) {
        timescale = audioTrack.inputTimeScale;
        // remember first PTS of this demuxing context. for audio, PTS = DTS
        initPTS = initDTS =
          audioSamples[0].pts - Math.round(timescale * timeOffset);
      }
    }

    if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
      // let's use input time scale as MP4 video timescale
      // we use input time scale straight away to avoid rounding issues on frame duration / cts computation
      videoTrack.timescale = videoTrack.inputTimeScale;
      tracks.video = {
        id: 'main',
        container: 'video/mp4',
        codec: videoTrack.codec,
        initSegment: MP4.initSegment([videoTrack]),
        metadata: {
          width: videoTrack.width,
          height: videoTrack.height,
        },
      };
      if (computePTSDTS) {
        timescale = videoTrack.inputTimeScale;
        const startPTS = this.getVideoStartPts(videoSamples);
        const startOffset = Math.round(timescale * timeOffset);
        initDTS = Math.min(
          initDTS as number,
          normalizePts(videoSamples[0].dts, startPTS) - startOffset
        );
        initPTS = Math.min(initPTS as number, startPTS - startOffset);
      }
    }

    if (Object.keys(tracks).length) {
      this.ISGenerated = true;
      if (computePTSDTS) {
        this._initPTS = initPTS as number;
        this._initDTS = initDTS as number;
      }

      return {
        tracks,
        initPTS,
        timescale,
      };
    }
  }
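  // Numeric example (assumed values): if the fragment's first audio PTS is
  // 900000 on a 90 kHz input clock and timeOffset is 5 s, then
  // initPTS = 900000 - Math.round(90000 * 5) = 450000. Subtracting initPTS
  // from that sample later yields 450000 ticks = 5 s, so the remuxed timeline
  // lines up with the playlist's fragment start.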
  369.  
  remuxVideo(
    track: DemuxedAvcTrack,
    timeOffset: number,
    contiguous: boolean,
    audioTrackLength: number
  ): RemuxedTrack | undefined {
    const timeScale: number = track.inputTimeScale;
    const inputSamples: Array<AvcSample> = track.samples;
    const outputSamples: Array<Mp4Sample> = [];
    const nbSamples: number = inputSamples.length;
    const initPTS: number = this._initPTS;
    let nextAvcDts = this.nextAvcDts;
    let offset = 8;
    let mp4SampleDuration!: number;
    let firstDTS;
    let lastDTS;
    let minPTS: number = Number.POSITIVE_INFINITY;
    let maxPTS: number = Number.NEGATIVE_INFINITY;
    let ptsDtsShift = 0;
    let sortSamples = false;

    // if parsed fragment is contiguous with last one, let's use last DTS value as reference
    if (!contiguous || nextAvcDts === null) {
      const pts = timeOffset * timeScale;
      const cts =
        inputSamples[0].pts -
        normalizePts(inputSamples[0].dts, inputSamples[0].pts);
      // if not contiguous, let's use target timeOffset
      nextAvcDts = pts - cts;
    }

    // PTS is coded on 33 bits, and can loop from -2^32 to 2^32
    // normalizePts will make PTS/DTS values monotonic; we use the last known DTS value as the reference
    for (let i = 0; i < nbSamples; i++) {
      const sample = inputSamples[i];
      sample.pts = normalizePts(sample.pts - initPTS, nextAvcDts);
      sample.dts = normalizePts(sample.dts - initPTS, nextAvcDts);
      if (sample.dts > sample.pts) {
        const PTS_DTS_SHIFT_TOLERANCE_90KHZ = 90000 * 0.2;
        ptsDtsShift = Math.max(
          Math.min(ptsDtsShift, sample.pts - sample.dts),
          -1 * PTS_DTS_SHIFT_TOLERANCE_90KHZ
        );
      }
      if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
        sortSamples = true;
      }
    }

    // sort video samples by DTS then PTS then demux id order
    if (sortSamples) {
      inputSamples.sort(function (a, b) {
        const deltadts = a.dts - b.dts;
        const deltapts = a.pts - b.pts;
        return deltadts || deltapts;
      });
    }

    // Get first/last DTS
    firstDTS = inputSamples[0].dts;
    lastDTS = inputSamples[inputSamples.length - 1].dts;

    // on Safari let's signal the same sample duration for all samples
    // sample duration (as expected by trun MP4 boxes) should be the delta between sample DTS
    // set this constant duration as being the avg delta between consecutive DTS.
    const averageSampleDuration = Math.round(
      (lastDTS - firstDTS) / (nbSamples - 1)
    );
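    // For instance (illustrative): at 90000 ticks/s and ~30 fps, consecutive
    // DTS values sit ~3000 ticks apart, so averageSampleDuration ≈ 3000.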

    // handle broken streams with PTS < DTS, tolerance up to 0.2 seconds
    if (ptsDtsShift < 0) {
      if (ptsDtsShift < averageSampleDuration * -2) {
        // Fix for "CNN special report, with CC" in test-streams (including Safari browser)
        // With large PTS < DTS errors such as this, we want to correct CTS while maintaining increasing DTS values
        logger.warn(
          `PTS < DTS detected in video samples, offsetting DTS from PTS by ${toMsFromMpegTsClock(
            -averageSampleDuration,
            true
          )} ms`
        );
        let lastDts = ptsDtsShift;
        for (let i = 0; i < nbSamples; i++) {
          inputSamples[i].dts = lastDts = Math.max(
            lastDts,
            inputSamples[i].pts - averageSampleDuration
          );
          inputSamples[i].pts = Math.max(lastDts, inputSamples[i].pts);
        }
      } else {
        // Fix for "Custom IV with bad PTS DTS" in test-streams
        // With smaller PTS < DTS errors we can simply move all DTS back. This increases CTS without causing buffer gaps or decode errors in Safari
        logger.warn(
          `PTS < DTS detected in video samples, shifting DTS by ${toMsFromMpegTsClock(
            ptsDtsShift,
            true
          )} ms to overcome this issue`
        );
        for (let i = 0; i < nbSamples; i++) {
          inputSamples[i].dts = inputSamples[i].dts + ptsDtsShift;
        }
      }
      firstDTS = inputSamples[0].dts;
    }

    // if fragments are contiguous, detect hole/overlapping between fragments
    if (contiguous) {
      // check timestamp continuity across consecutive fragments (this is to remove inter-fragment gap/hole)
      const delta = firstDTS - nextAvcDts;
      const foundHole = delta > averageSampleDuration;
      const foundOverlap = delta < -1;
      if (foundHole || foundOverlap) {
        if (foundHole) {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              delta,
              true
            )} ms (${delta}dts) hole between fragments detected, filling it`
          );
        } else {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              -delta,
              true
            )} ms (${delta}dts) overlapping between fragments detected`
          );
        }
        firstDTS = nextAvcDts;
        const firstPTS = inputSamples[0].pts - delta;
        inputSamples[0].dts = firstDTS;
        inputSamples[0].pts = firstPTS;
        logger.log(
          `Video: First PTS/DTS adjusted: ${toMsFromMpegTsClock(
            firstPTS,
            true
          )}/${toMsFromMpegTsClock(
            firstDTS,
            true
          )}, delta: ${toMsFromMpegTsClock(delta, true)} ms`
        );
      }
    }

    if (requiresPositiveDts) {
      firstDTS = Math.max(0, firstDTS);
    }
    let nbNalu = 0;
    let naluLen = 0;
    for (let i = 0; i < nbSamples; i++) {
      // compute total/avc sample length and nb of NAL units
      const sample = inputSamples[i];
      const units = sample.units;
      const nbUnits = units.length;
      let sampleLen = 0;
      for (let j = 0; j < nbUnits; j++) {
        sampleLen += units[j].data.length;
      }

      naluLen += sampleLen;
      nbNalu += nbUnits;
      sample.length = sampleLen;

      // normalize PTS/DTS
      // ensure sample monotonic DTS
      sample.dts = Math.max(sample.dts, firstDTS);
      // ensure that the computed PTS is greater than or equal to the sample DTS
      sample.pts = Math.max(sample.pts, sample.dts, 0);
      minPTS = Math.min(sample.pts, minPTS);
      maxPTS = Math.max(sample.pts, maxPTS);
    }
    lastDTS = inputSamples[nbSamples - 1].dts;

    /* concatenate the video data and construct the mdat in place
      (need 8 more bytes to fill length and mdat type) */
    const mdatSize = naluLen + 4 * nbNalu + 8;
    let mdat;
    try {
      mdat = new Uint8Array(mdatSize);
    } catch (err) {
      this.observer.emit(Events.ERROR, Events.ERROR, {
        type: ErrorTypes.MUX_ERROR,
        details: ErrorDetails.REMUX_ALLOC_ERROR,
        fatal: false,
        bytes: mdatSize,
        reason: `fail allocating video mdat ${mdatSize}`,
      });
      return;
    }
    const view = new DataView(mdat.buffer);
    view.setUint32(0, mdatSize);
    mdat.set(MP4.types.mdat, 4);

    for (let i = 0; i < nbSamples; i++) {
      const avcSample = inputSamples[i];
      const avcSampleUnits = avcSample.units;
      let mp4SampleLength = 0;
      // convert NALU bitstream to MP4 format (prepend NALU with size field)
      for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
        const unit = avcSampleUnits[j];
        const unitData = unit.data;
        const unitDataLen = unit.data.byteLength;
        view.setUint32(offset, unitDataLen);
        offset += 4;
        mdat.set(unitData, offset);
        offset += unitDataLen;
        mp4SampleLength += 4 + unitDataLen;
      }

      // expected sample duration is the Decoding Timestamp diff of consecutive samples
      if (i < nbSamples - 1) {
        mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
      } else {
        const config = this.config;
        const lastFrameDuration =
          avcSample.dts - inputSamples[i > 0 ? i - 1 : i].dts;
        if (config.stretchShortVideoTrack && this.nextAudioPts !== null) {
          // In some cases, a segment's audio track duration may exceed the video track duration.
          // Since we've already remuxed audio, and we know how long the audio track is, we look to
          // see if the delta to the next segment is longer than maxBufferHole.
          // If so, playback would potentially get stuck, so we artificially inflate
          // the duration of the last frame to minimize any potential gap between segments.
          const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
          const deltaToFrameEnd =
            (audioTrackLength
              ? minPTS + audioTrackLength * timeScale
              : this.nextAudioPts) - avcSample.pts;
          if (deltaToFrameEnd > gapTolerance) {
            // We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
            // frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
            mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
            if (mp4SampleDuration < 0) {
              mp4SampleDuration = lastFrameDuration;
            }
            logger.log(
              `[mp4-remuxer]: It is approximately ${
                deltaToFrameEnd / 90
              } ms to the next segment; using duration ${
                mp4SampleDuration / 90
              } ms for the last video frame.`
            );
          } else {
            mp4SampleDuration = lastFrameDuration;
          }
        } else {
          mp4SampleDuration = lastFrameDuration;
        }
      }
      const compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);

      outputSamples.push(
        new Mp4Sample(
          avcSample.key,
          mp4SampleDuration,
          mp4SampleLength,
          compositionTimeOffset
        )
      );
    }

    if (outputSamples.length && chromeVersion && chromeVersion < 70) {
      // Chrome workaround, mark first sample as being a Random Access Point (keyframe) to avoid sourcebuffer append issue
      // https://code.google.com/p/chromium/issues/detail?id=229412
      const flags = outputSamples[0].flags;
      flags.dependsOn = 2;
      flags.isNonSync = 0;
    }

    console.assert(
      mp4SampleDuration !== undefined,
      'mp4SampleDuration must be computed'
    );
    // next AVC sample DTS should be equal to last sample DTS + last sample duration (in PES timescale)
    this.nextAvcDts = nextAvcDts = lastDTS + mp4SampleDuration;
    this.isVideoContiguous = true;
    const moof = MP4.moof(
      track.sequenceNumber++,
      firstDTS,
      Object.assign({}, track, {
        samples: outputSamples,
      })
    );
    const type: SourceBufferName = 'video';
    const data = {
      data1: moof,
      data2: mdat,
      startPTS: minPTS / timeScale,
      endPTS: (maxPTS + mp4SampleDuration) / timeScale,
      startDTS: firstDTS / timeScale,
      endDTS: (nextAvcDts as number) / timeScale,
      type,
      hasAudio: false,
      hasVideo: true,
      nb: outputSamples.length,
      dropped: track.dropped,
    };

    track.samples = [];
    track.dropped = 0;

    console.assert(mdat.length, 'MDAT length must not be zero');

    return data;
  }

  remuxAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    accurateTimeOffset: boolean,
    videoTimeOffset?: number
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const mp4SampleDuration: number = track.isAAC
      ? AAC_SAMPLES_PER_FRAME
      : MPEG_AUDIO_SAMPLE_PER_FRAME;
    const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
    const initPTS: number = this._initPTS;
    const rawMPEG: boolean = !track.isAAC && this.typeSupported.mpeg;
    const outputSamples: Array<Mp4Sample> = [];

    let inputSamples: Array<AudioSample> = track.samples;
    let offset: number = rawMPEG ? 0 : 8;
    let nextAudioPts: number = this.nextAudioPts || -1;

    // window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);

    // for audio samples, also consider consecutive fragments as being contiguous (even if a level switch occurs),
    // for sake of clarity:
    // consecutive fragments are frags with
    // - less than 100ms gap between the new time offset (if accurate) and the next expected PTS OR
    // - less than 20 audio frames distance
    // contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
    // this helps ensure audio continuity,
    // and it also avoids audio glitches/cuts when switching quality, or reporting a wrong duration on the first audio frame
    const timeOffsetMpegTS = timeOffset * inputTimeScale;
    this.isAudioContiguous = contiguous =
      contiguous ||
      ((inputSamples.length &&
        nextAudioPts > 0 &&
        ((accurateTimeOffset &&
          Math.abs(timeOffsetMpegTS - nextAudioPts) < 9000) ||
          Math.abs(
            normalizePts(inputSamples[0].pts - initPTS, timeOffsetMpegTS) -
              nextAudioPts
          ) <
            20 * inputSampleDuration)) as boolean);
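    // Concretely (illustrative numbers): with a 90 kHz input clock, the
    // 9000-tick tolerance above is 100 ms, and for 44.1 kHz AAC the 20-frame
    // tolerance is 20 * 1024 * (90000 / 44100) ≈ 41796 ticks, roughly 464 ms.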

    // compute normalized PTS
    inputSamples.forEach(function (sample) {
      sample.pts = normalizePts(sample.pts - initPTS, timeOffsetMpegTS);
    });

    if (!contiguous || nextAudioPts < 0) {
      // filter out samples with negative PTS that are not playable anyway
      // if we don't remove these negative samples, they will shift all audio samples forward,
      // leading to audio overlap between the current / next fragment
      inputSamples = inputSamples.filter((sample) => sample.pts >= 0);

      // in case all samples have negative PTS, and have been filtered out, return now
      if (!inputSamples.length) {
        return;
      }

      if (videoTimeOffset === 0) {
        // Set the start to 0 to match video so that start gaps larger than inputSampleDuration are filled with silence
        nextAudioPts = 0;
      } else if (accurateTimeOffset) {
        // When not seeking, not live, and LevelDetails.PTSKnown, use fragment start as predicted next audio PTS
        nextAudioPts = Math.max(0, timeOffsetMpegTS);
      } else {
        // if frags are not contiguous and we can't trust the time offset, let's use the first sample PTS as next audio PTS
        nextAudioPts = inputSamples[0].pts;
      }
    }

    // If the audio track is missing samples, the frames seem to get "left-shifted" within the
    // resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
    // In an effort to prevent this from happening, we inject frames here where there are gaps.
    // When possible, we inject a silent frame; when that's not possible, we duplicate the last
    // frame.

    if (track.isAAC) {
      const alignedWithVideo = videoTimeOffset !== undefined;
      const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
      for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length; i++) {
        // First, let's see how far off this frame is from where we expect it to be
        const sample = inputSamples[i];
        const pts = sample.pts;
        const delta = pts - nextPts;
        const duration = Math.abs((1000 * delta) / inputTimeScale);

        // When remuxing with video, if we're overlapping by more than a duration, drop this sample to stay in sync
        if (
          delta <= -maxAudioFramesDrift * inputSampleDuration &&
          alignedWithVideo
        ) {
          if (i === 0) {
            logger.warn(
              `Audio frame @ ${(pts / inputTimeScale).toFixed(
                3
              )}s overlaps nextAudioPts by ${Math.round(
                (1000 * delta) / inputTimeScale
              )} ms.`
            );
            this.nextAudioPts = nextAudioPts = nextPts = pts;
          }
        } // eslint-disable-line brace-style

        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: currentTime (aka nextPtsNorm) is not 0
        // 4: remuxing with video (videoTimeOffset !== undefined)
        else if (
          delta >= maxAudioFramesDrift * inputSampleDuration &&
          duration < MAX_SILENT_FRAME_DURATION &&
          alignedWithVideo
        ) {
          let missing = Math.round(delta / inputSampleDuration);
          // Adjust nextPts so that silent samples are aligned with media pts. This will prevent media samples from
          // later being shifted if nextPts is based on timeOffset and delta is not a multiple of inputSampleDuration.
          nextPts = pts - missing * inputSampleDuration;
          if (nextPts < 0) {
            missing--;
            nextPts += inputSampleDuration;
          }
          if (i === 0) {
            this.nextAudioPts = nextAudioPts = nextPts;
          }
          logger.warn(
            `[mp4-remuxer]: Injecting ${missing} audio frame @ ${(
              nextPts / inputTimeScale
            ).toFixed(3)}s due to ${Math.round(
              (1000 * delta) / inputTimeScale
            )} ms gap.`
          );
          for (let j = 0; j < missing; j++) {
            const newStamp = Math.max(nextPts as number, 0);
            let fillFrame = AAC.getSilentFrame(
              track.manifestCodec || track.codec,
              track.channelCount
            );
            if (!fillFrame) {
              logger.log(
                '[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating last frame instead.'
              );
              fillFrame = sample.unit.subarray();
            }
            inputSamples.splice(i, 0, {
              unit: fillFrame,
              pts: newStamp,
            });
            nextPts += inputSampleDuration;
            i++;
          }
        }
        sample.pts = nextPts;
        nextPts += inputSampleDuration;
      }
    }
    let firstPTS: number | null = null;
    let lastPTS: number | null = null;
    let mdat: any;
    let mdatSize: number = 0;
    let sampleLength: number = inputSamples.length;
    while (sampleLength--) {
      mdatSize += inputSamples[sampleLength].unit.byteLength;
    }
    for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
      const audioSample = inputSamples[j];
      const unit = audioSample.unit;
      let pts = audioSample.pts;
      if (lastPTS !== null) {
        // If we have more than one sample, set the duration of the sample to the "real" duration; the PTS diff with
        // the previous sample
        const prevSample = outputSamples[j - 1];
        prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
      } else {
        if (contiguous && track.isAAC) {
          // set PTS/DTS to expected PTS/DTS
          pts = nextAudioPts;
        }
        // remember first PTS of our audioSamples
        firstPTS = pts;
        if (mdatSize > 0) {
          /* concatenate the audio data and construct the mdat in place
            (need 8 more bytes to fill length and mdat type) */
          mdatSize += offset;
          try {
            mdat = new Uint8Array(mdatSize);
          } catch (err) {
            this.observer.emit(Events.ERROR, Events.ERROR, {
              type: ErrorTypes.MUX_ERROR,
              details: ErrorDetails.REMUX_ALLOC_ERROR,
              fatal: false,
              bytes: mdatSize,
              reason: `fail allocating audio mdat ${mdatSize}`,
            });
            return;
          }
          if (!rawMPEG) {
            const view = new DataView(mdat.buffer);
            view.setUint32(0, mdatSize);
            mdat.set(MP4.types.mdat, 4);
          }
        } else {
          // no audio samples
          return;
        }
      }
      mdat.set(unit, offset);
      const unitLen = unit.byteLength;
      offset += unitLen;
      // Default the sample's duration to the computed mp4SampleDuration, which will either be 1024 for AAC or 1152 for MPEG
      // In the case that we have 1 sample, this will be the duration. If we have more than one sample, the duration
      // becomes the PTS diff with the previous sample
      outputSamples.push(new Mp4Sample(true, mp4SampleDuration, unitLen, 0));
      lastPTS = pts;
    }

    // We could end up with no audio samples if all input samples were overlapping with the previously remuxed ones
    const nbSamples = outputSamples.length;
    if (!nbSamples) {
      return;
    }

    // The next audio sample PTS should be equal to last sample PTS + duration
    const lastSample = outputSamples[outputSamples.length - 1];
    this.nextAudioPts = nextAudioPts =
      lastPTS! + scaleFactor * lastSample.duration;

    // Set the track samples from inputSamples to outputSamples before remuxing
    const moof = rawMPEG
      ? new Uint8Array(0)
      : MP4.moof(
          track.sequenceNumber++,
          firstPTS! / scaleFactor,
          Object.assign({}, track, { samples: outputSamples })
        );

    // Clear the track samples. This also clears the samples array in the demuxer, since the reference is shared
    track.samples = [];
    const start = firstPTS! / inputTimeScale;
    const end = nextAudioPts / inputTimeScale;
    const type: SourceBufferName = 'audio';
    const audioData = {
      data1: moof,
      data2: mdat,
      startPTS: start,
      endPTS: end,
      startDTS: start,
      endDTS: end,
      type,
      hasAudio: true,
      hasVideo: false,
      nb: nbSamples,
    };

    this.isAudioContiguous = true;

    console.assert(mdat.length, 'MDAT length must not be zero');
    return audioData;
  }

  remuxEmptyAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    videoData: Fragment
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const nextAudioPts: number | null = this.nextAudioPts;
    // sync with video's timestamp
    const startDTS: number =
      (nextAudioPts !== null
        ? nextAudioPts
        : videoData.startDTS * inputTimeScale) + this._initDTS;
    const endDTS: number = videoData.endDTS * inputTimeScale + this._initDTS;
    // one sample's duration value
    const frameDuration: number = scaleFactor * AAC_SAMPLES_PER_FRAME;
    // samples count of this segment's duration
    const nbSamples: number = Math.ceil((endDTS - startDTS) / frameDuration);
    // silent frame
    const silentFrame: Uint8Array | undefined = AAC.getSilentFrame(
      track.manifestCodec || track.codec,
      track.channelCount
    );

    logger.warn('[mp4-remuxer]: remux empty Audio');
    // Can't remux if we can't generate a silent frame...
    if (!silentFrame) {
      logger.trace(
        '[mp4-remuxer]: Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec'
      );
      return;
    }

    const samples: Array<any> = [];
    for (let i = 0; i < nbSamples; i++) {
      const stamp = startDTS + i * frameDuration;
      samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
    }
    track.samples = samples;

    return this.remuxAudio(track, timeOffset, contiguous, false);
  }
}
  984.  
export function normalizePts(value: number, reference: number | null): number {
  let offset;
  if (reference === null) {
    return value;
  }

  if (reference < value) {
    // - 2^33
    offset = -8589934592;
  } else {
    // + 2^33
    offset = 8589934592;
  }
  /* PTS is 33 bit (from 0 to 2^33 - 1)
    if the diff between value and reference is bigger than half of the amplitude (2^32), it means that
    PTS looping occurred. fill the gap */
  while (Math.abs(value - reference) > 4294967296) {
    value += offset;
  }

  return value;
}
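// Worked example (illustrative): normalizePts(1000, 8589934000) takes the
// reference > value branch, so offset = +2^33. Since |1000 - 8589934000| is
// greater than 2^32, value becomes 1000 + 8589934592 = 8589935592, which is
// within 2^32 of the reference: the 33-bit MPEG-TS counter wrapped once and
// the gap is bridged.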
  1007.  
function findKeyframeIndex(samples: Array<AvcSample>): number {
  for (let i = 0; i < samples.length; i++) {
    if (samples[i].key) {
      return i;
    }
  }
  return -1;
}

export function flushTextTrackMetadataCueSamples(
  track: DemuxedMetadataTrack,
  timeOffset: number,
  initPTS: number,
  initDTS: number
): RemuxedMetadata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }
  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // setting id3 pts, dts to relative time
    // using this._initPTS and this._initDTS to calculate relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
    sample.dts =
      normalizePts(sample.dts - initDTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

export function flushTextTrackUserdataCueSamples(
  track: DemuxedUserdataTrack,
  timeOffset: number,
  initPTS: number
): RemuxedUserdata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }

  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // setting text pts, dts to relative time
    // using this._initPTS and this._initDTS to calculate relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  track.samples.sort((a, b) => a.pts - b.pts);
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

class Mp4Sample {
  public size: number;
  public duration: number;
  public cts: number;
  public flags: Mp4SampleFlags;

  constructor(
    isKeyframe: boolean,
    duration: number,
    size: number,
    cts: number
  ) {
    this.duration = duration;
    this.size = size;
    this.cts = cts;
    this.flags = new Mp4SampleFlags(isKeyframe);
  }
}

class Mp4SampleFlags {
  public isLeading: 0 = 0;
  public isDependedOn: 0 = 0;
  public hasRedundancy: 0 = 0;
  public degradPrio: 0 = 0;
  public dependsOn: 1 | 2 = 1;
  public isNonSync: 0 | 1 = 1;

  constructor(isKeyframe: boolean) {
    this.dependsOn = isKeyframe ? 2 : 1;
    this.isNonSync = isKeyframe ? 0 : 1;
  }
}
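// Illustrative note (flag semantics per the ISO/IEC 14496-12 trun box):
// a keyframe sample gets { dependsOn: 2, isNonSync: 0 }, i.e. it depends on no
// other samples and is a sync sample, while any other frame gets
// { dependsOn: 1, isNonSync: 1 }. For example:
//
//   new Mp4Sample(true, 3000, 4200, 0).flags; // dependsOn: 2, isNonSync: 0
//   new Mp4Sample(false, 3000, 980, 1500).flags; // dependsOn: 1, isNonSync: 1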