diff --git a/test/ref/repack.txt b/test/ref/repack.txt new file mode 100644 index 0000000000..3946a6eb47 --- /dev/null +++ b/test/ref/repack.txt @@ -0,0 +1,163 @@ +0bgr => [pa] [un] gbrp | a=1:1 [tu] [tp] +0rgb => [pa] [un] gbrp | a=1:1 [tu] [tp] +abgr => [pa] [un] gbrap | a=1:1 [tu] [tp] +argb => [pa] [un] gbrap | a=1:1 [tu] [tp] +ayuv64 => [pa] [un] yuva444p16 | a=1:1 [tu] [tp] +ayuv64be => [pa] [un] yuva444p16 | a=1:1 [tu] [tp] +bayer_bggr16 => no +bayer_bggr16be => no +bayer_bggr8 => no +bayer_gbrg16 => no +bayer_gbrg16be => no +bayer_gbrg8 => no +bayer_grbg16 => no +bayer_grbg16be => no +bayer_grbg8 => no +bayer_rggb16 => no +bayer_rggb16be => no +bayer_rggb8 => no +bgr0 => [pa] [un] gbrp | a=1:1 [tu] [tp] +bgr24 => [pa] [un] gbrp | a=1:1 +bgr4 => no +bgr444 => [pa] [un] gbrp4 | a=1:1 +bgr444 => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr444be => [pa] [un] gbrp4 | a=1:1 +bgr444be => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr48 => [pa] [un] gbrp16 | a=1:1 +bgr48be => [pa] [un] gbrp16 | a=1:1 +bgr4_byte => [pa] [un] gbrp2 | a=1:1 +bgr4_byte => [pa] [un] gbrp1 | a=1:1 [round-down] +bgr4_byte => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr555 => [pa] [un] gbrp5 | a=1:1 +bgr555 => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr555be => [pa] [un] gbrp5 | a=1:1 +bgr555be => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr565 => [pa] [un] gbrp6 | a=1:1 +bgr565 => [pa] [un] gbrp5 | a=1:1 [round-down] +bgr565 => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr565be => [pa] [un] gbrp6 | a=1:1 +bgr565be => [pa] [un] gbrp5 | a=1:1 [round-down] +bgr565be => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgr8 => [pa] [un] gbrp3 | a=1:1 +bgr8 => [pa] [un] gbrp2 | a=1:1 [round-down] +bgr8 => [pa] [un] gbrp | a=1:1 [expand-8bit] +bgra => [pa] [un] gbrap | a=1:1 [tu] [tp] +bgra64 => [pa] [un] gbrap16 | a=1:1 +bgra64be => [pa] [un] gbrap16 | a=1:1 +cuda => no +d3d11 => no +d3d11va_vld => no +drm_prime => no +dxva2_vld => no +gbrap10be => [pa] [un] gbrap10 | a=1:1 +gbrap12be => [pa] [un] gbrap12 | a=1:1 +gbrap16be 
=> [pa] [un] gbrap16 | a=1:1 +gbrapf32be => [pa] [un] gbrapf32 | a=1:1 +gbrp10be => [pa] [un] gbrp10 | a=1:1 +gbrp12be => [pa] [un] gbrp12 | a=1:1 +gbrp14be => [pa] [un] gbrp14 | a=1:1 +gbrp16be => [pa] [un] gbrp16 | a=1:1 +gbrp9be => [pa] [un] gbrp9 | a=1:1 +gbrpf32be => [pa] [un] gbrpf32 | a=1:1 +gray10be => [pa] [un] gray10 | a=1:1 +gray12be => [pa] [un] gray12 | a=1:1 +gray14be => [pa] [un] gray14 | a=1:1 +gray16be => [pa] [un] gray16 | a=1:1 +gray9be => [pa] [un] gray9 | a=1:1 +grayf32be => [pa] [un] grayf32 | a=1:1 +mediacodec => no +mmal => no +monob => [pa] [un] y1 | a=8:1 [tu] [tp] +monob => [pa] [un] gray | a=8:1 [expand-8bit] +monow => [pa] [un] y1 | a=8:1 [tu] [tp] +monow => [pa] [un] gray | a=8:1 [expand-8bit] +nv12 => [pa] [un] yuv420p | a=2:2 [tu] [tp] +nv16 => [pa] [un] yuv422p | a=2:1 +nv20 => [pa] [un] yuv422p10 | a=2:1 +nv20be => [pa] [un] yuv422p10 | a=2:1 +nv21 => [pa] [un] yuv420p | a=2:2 [tu] [tp] +nv24 => [pa] [un] yuv444p | a=1:1 +nv42 => [pa] [un] yuv444p | a=1:1 +opencl => no +p010 => [pa] [un] yuv420p16 | a=2:2 +p010be => [pa] [un] yuv420p16 | a=2:2 +p016 => [pa] [un] yuv420p16 | a=2:2 +p016be => [pa] [un] yuv420p16 | a=2:2 +pal8 => [un] gbrap | a=1:1 +qsv => no +rgb0 => [pa] [un] gbrp | a=1:1 [tu] [tp] +rgb24 => [pa] [un] gbrp | a=1:1 +rgb30 => [pa] [un] gbrp10 | a=1:1 +rgb4 => no +rgb444 => [pa] [un] gbrp4 | a=1:1 +rgb444 => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgb444be => [pa] [un] gbrp4 | a=1:1 +rgb444be => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgb48 => [pa] [un] gbrp16 | a=1:1 +rgb48be => [pa] [un] gbrp16 | a=1:1 [tu] [tp] +rgb4_byte => [pa] [un] gbrp2 | a=1:1 +rgb4_byte => [pa] [un] gbrp1 | a=1:1 [round-down] +rgb4_byte => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgb555 => [pa] [un] gbrp5 | a=1:1 +rgb555 => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgb555be => [pa] [un] gbrp5 | a=1:1 +rgb555be => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgb565 => [pa] [un] gbrp6 | a=1:1 +rgb565 => [pa] [un] gbrp5 | a=1:1 [round-down] +rgb565 => [pa] [un] 
gbrp | a=1:1 [expand-8bit] +rgb565be => [pa] [un] gbrp6 | a=1:1 +rgb565be => [pa] [un] gbrp5 | a=1:1 [round-down] +rgb565be => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgb8 => [pa] [un] gbrp3 | a=1:1 +rgb8 => [pa] [un] gbrp2 | a=1:1 [round-down] +rgb8 => [pa] [un] gbrp | a=1:1 [expand-8bit] +rgba => [pa] [un] gbrap | a=1:1 [tu] [tp] +rgba64 => [pa] [un] gbrap16 | a=1:1 [tu] [tp] +rgba64be => [pa] [un] gbrap16 | a=1:1 +uyvy422 => [pa] [un] yuv422p | a=2:1 +uyyvyy411 => no +vaapi => no +vaapi_idct => no +vaapi_moco => no +vdpau => no +vdpau_output => no +videotoolbox => no +vulkan => no +xvmc => no +xyz12 => [pa] [un] gbrp16 | a=1:1 +xyz12be => [pa] [un] gbrp16 | a=1:1 +y210 => [pa] [un] yuv422p16 | a=2:1 +y210be => [pa] [un] yuv422p16 | a=2:1 +ya16 => [pa] [un] yap16 | a=1:1 [tu] [tp] +ya16be => [pa] [un] yap16 | a=1:1 +ya8 => [pa] [un] yap8 | a=1:1 +yuv420p10be => [pa] [un] yuv420p10 | a=2:2 +yuv420p12be => [pa] [un] yuv420p12 | a=2:2 +yuv420p14be => [pa] [un] yuv420p14 | a=2:2 +yuv420p16be => [pa] [un] yuv420p16 | a=2:2 +yuv420p9be => [pa] [un] yuv420p9 | a=2:2 +yuv422p10be => [pa] [un] yuv422p10 | a=2:1 +yuv422p12be => [pa] [un] yuv422p12 | a=2:1 +yuv422p14be => [pa] [un] yuv422p14 | a=2:1 +yuv422p16be => [pa] [un] yuv422p16 | a=2:1 [tu] [tp] +yuv422p9be => [pa] [un] yuv422p9 | a=2:1 +yuv440p10be => [pa] [un] yuv440p10 | a=1:2 +yuv440p12be => [pa] [un] yuv440p12 | a=1:2 +yuv444p10be => [pa] [un] yuv444p10 | a=1:1 +yuv444p12be => [pa] [un] yuv444p12 | a=1:1 +yuv444p14be => [pa] [un] yuv444p14 | a=1:1 +yuv444p16be => [pa] [un] yuv444p16 | a=1:1 +yuv444p9be => [pa] [un] yuv444p9 | a=1:1 +yuva420p10be => [pa] [un] yuva420p10 | a=2:2 +yuva420p16be => [pa] [un] yuva420p16 | a=2:2 +yuva420p9be => [pa] [un] yuva420p9 | a=2:2 +yuva422p10be => [pa] [un] yuva422p10 | a=2:1 +yuva422p12be => [pa] [un] yuva422p12 | a=2:1 +yuva422p16be => [pa] [un] yuva422p16 | a=2:1 +yuva422p9be => [pa] [un] yuva422p9 | a=2:1 +yuva444p10be => [pa] [un] yuva444p10 | a=1:1 +yuva444p12be => [pa] 
[un] yuva444p12 | a=1:1 +yuva444p16be => [pa] [un] yuva444p16 | a=1:1 +yuva444p9be => [pa] [un] yuva444p9 | a=1:1 +yuyv422 => [pa] [un] yuv422p | a=2:1 +yvyu422 => [pa] [un] yuv422p | a=2:1 [tu] [tp] diff --git a/test/repack.c b/test/repack.c new file mode 100644 index 0000000000..ede6046350 --- /dev/null +++ b/test/repack.c @@ -0,0 +1,249 @@ +#include + +#include "common/common.h" +#include "tests.h" +#include "video/fmt-conversion.h" +#include "video/img_format.h" +#include "video/repack.h" +#include "video/zimg.h" + +// Excuse the utter stupidity. +#define UNFUCK(v) ((v) > 0 ? (v) : pixfmt2imgfmt(-(v))) +static_assert(IMGFMT_START > 0, ""); +#define IMGFMT_GBRP (-AV_PIX_FMT_GBRP) +#define IMGFMT_GBRAP (-AV_PIX_FMT_GBRAP) + +struct entry { + int w, h; + int fmt_a; + const void *const a[4]; + int fmt_b; + const void *const b[4]; + int flags; +}; + +#define P8(...) (const uint8_t[]){__VA_ARGS__} +#define P16(...) (const uint16_t[]){__VA_ARGS__} + +// Warning: only entries that match existing conversions are tested. +static const struct entry repack_tests[] = { + // Note: the '0' tests rely on 0 being written, although by definition the + // contents of this padding is undefined. The repacker always writes + // it this way, though. 
+ {1, 1, IMGFMT_RGB0, {P8(1, 2, 3, 0)}, + IMGFMT_GBRP, {P8(2), P8(3), P8(1)}}, + {1, 1, IMGFMT_BGR0, {P8(1, 2, 3, 0)}, + IMGFMT_GBRP, {P8(2), P8(1), P8(3)}}, + {1, 1, IMGFMT_0RGB, {P8(0, 1, 2, 3)}, + IMGFMT_GBRP, {P8(2), P8(3), P8(1)}}, + {1, 1, IMGFMT_0BGR, {P8(0, 1, 2, 3)}, + IMGFMT_GBRP, {P8(2), P8(1), P8(3)}}, + {1, 1, IMGFMT_RGBA, {P8(1, 2, 3, 4)}, + IMGFMT_GBRAP, {P8(2), P8(3), P8(1), P8(4)}}, + {1, 1, IMGFMT_BGRA, {P8(1, 2, 3, 4)}, + IMGFMT_GBRAP, {P8(2), P8(1), P8(3), P8(4)}}, + {1, 1, IMGFMT_ARGB, {P8(4, 1, 2, 3)}, + IMGFMT_GBRAP, {P8(2), P8(3), P8(1), P8(4)}}, + {1, 1, IMGFMT_ABGR, {P8(4, 1, 2, 3)}, + IMGFMT_GBRAP, {P8(2), P8(1), P8(3), P8(4)}}, + {1, 1, IMGFMT_RGBA64, {P16(0x1a1b, 0x2a2b, 0x3a3b, 0x4a4b)}, + -AV_PIX_FMT_GBRAP16, {P16(0x2a2b), P16(0x3a3b), + P16(0x1a1b), P16(0x4a4b)}}, + {1, 1, -AV_PIX_FMT_RGB48BE, {P16(0x1a1b, 0x2a2b, 0x3a3b)}, + -AV_PIX_FMT_GBRP16, {P16(0x2b2a), P16(0x3b3a), + P16(0x1b1a)}}, + {8, 1, -AV_PIX_FMT_MONOWHITE, {P8(0xAA)}, + IMGFMT_Y1, {P8(0, 1, 0, 1, 0, 1, 0, 1)}}, + {8, 1, -AV_PIX_FMT_MONOBLACK, {P8(0xAA)}, + IMGFMT_Y1, {P8(1, 0, 1, 0, 1, 0, 1, 0)}}, + {2, 2, IMGFMT_NV12, {P8(1, 2, 3, 4), P8(5, 6)}, + IMGFMT_420P, {P8(1, 2, 3, 4), P8(5), P8(6)}}, + {2, 2, -AV_PIX_FMT_NV21, {P8(1, 2, 3, 4), P8(5, 6)}, + IMGFMT_420P, {P8(1, 2, 3, 4), P8(6), P8(5)}}, + {1, 1, -AV_PIX_FMT_AYUV64, {P16(1, 2, 3, 4)}, + -AV_PIX_FMT_YUVA444P16, {P16(2), P16(3), P16(4), P16(1)}}, + {1, 1, -AV_PIX_FMT_AYUV64BE, {P16(0x0100, 0x0200, 0x0300, 0x0400)}, + -AV_PIX_FMT_YUVA444P16, {P16(2), P16(3), P16(4), P16(1)}}, + {2, 1, -AV_PIX_FMT_YVYU422, {P8(1, 2, 3, 4)}, + -AV_PIX_FMT_YUV422P, {P8(1, 3), P8(4), P8(2)}}, + {1, 1, -AV_PIX_FMT_YA16, {P16(1, 2)}, + IMGFMT_YAP16, {P16(1), P16(2)}}, + {2, 1, -AV_PIX_FMT_YUV422P16BE, {P16(0x1a1b, 0x2a2b), P16(0x3a3b), + P16(0x4a4b)}, + -AV_PIX_FMT_YUV422P16, {P16(0x1b1a, 0x2b2a), P16(0x3b3a), + P16(0x4b4a)}}, +}; + +static bool is_true_planar(int imgfmt) +{ + struct mp_regular_imgfmt desc; + if 
(!mp_get_regular_imgfmt(&desc, imgfmt)) + return false; + + for (int n = 0; n < desc.num_planes; n++) { + if (desc.planes[n].num_components != 1) + return false; + } + + return true; +} + +static int try_repack(struct test_ctx *ctx, FILE *f, int imgfmt, int flags, + int not_if_fmt) +{ + char *head = mp_tprintf(80, "%-15s =>", mp_imgfmt_to_name(imgfmt)); + struct mp_repack *un = mp_repack_create_planar(imgfmt, false, flags); + struct mp_repack *pa = mp_repack_create_planar(imgfmt, true, flags); + + // If both exists, they must be always symmetric. + if (un && pa) { + assert(mp_repack_get_format_src(pa) == mp_repack_get_format_dst(un)); + assert(mp_repack_get_format_src(un) == mp_repack_get_format_dst(pa)); + assert(mp_repack_get_align_x(pa) == mp_repack_get_align_x(un)); + assert(mp_repack_get_align_y(pa) == mp_repack_get_align_y(un)); + } + + int a = 0; + int b = 0; + if (un) { + a = mp_repack_get_format_src(un); + b = mp_repack_get_format_dst(un); + } else if (pa) { + a = mp_repack_get_format_dst(pa); + b = mp_repack_get_format_src(pa); + } + + // Skip the identity ones because they're uninteresting, and add too much + // noise. But still make sure they behave as expected. + if (is_true_planar(imgfmt)) { + // (note that we require alpha-enabled zimg) + assert(mp_zimg_supports_in_format(imgfmt)); + assert(un && pa); + assert(a == imgfmt && b == imgfmt); + talloc_free(pa); + talloc_free(un); + return 0; + } + + struct mp_repack *rp = pa ? pa : un; + if (!rp) { + if (!flags) + fprintf(f, "%s no\n", head); + return 0; + } + + assert(a == imgfmt); + if (b && b == not_if_fmt) { + talloc_free(pa); + talloc_free(un); + return 0; + } + + fprintf(f, "%s %4s %4s %-15s |", head, pa ? "[pa]" : "", un ? 
"[un]" : "", + mp_imgfmt_to_name(b)); + + fprintf(f, " a=%d:%d", mp_repack_get_align_x(rp), mp_repack_get_align_y(rp)); + + if (flags & REPACK_CREATE_ROUND_DOWN) + fprintf(f, " [round-down]"); + if (flags & REPACK_CREATE_EXPAND_8BIT) + fprintf(f, " [expand-8bit]"); + + // LCM of alignment of all packers. + int ax = mp_repack_get_align_x(rp); + int ay = mp_repack_get_align_y(rp); + if (pa && un) { + ax = MPMAX(mp_repack_get_align_x(pa), mp_repack_get_align_x(un)); + ay = MPMAX(mp_repack_get_align_y(pa), mp_repack_get_align_y(un)); + } + + for (int n = 0; n < MP_ARRAY_SIZE(repack_tests); n++) { + const struct entry *e = &repack_tests[n]; + int fmt_a = UNFUCK(e->fmt_a); + int fmt_b = UNFUCK(e->fmt_b); + if (!(fmt_a == a && fmt_b == b && e->flags == flags)) + continue; + + // We convert a "random" macro pixel to catch potential addressing bugs + // that might be ignored with (0, 0) origins. + struct mp_image *ia = mp_image_alloc(fmt_a, e->w * 5 * ax, e->h * 5 * ay); + struct mp_image *ib = mp_image_alloc(fmt_b, e->w * 7 * ax, e->h * 6 * ay); + int sx = 4 * ax, sy = 3 * ay, dx = 3 * ax, dy = 2 * ay; + + assert(ia && ib); + + for (int pack = 0; pack < 2; pack++) { + struct mp_repack *repacker = pack ? pa : un; + if (!repacker) + continue; + + mp_image_clear(ia, 0, 0, ia->w, ia->h); + mp_image_clear(ib, 0, 0, ib->w, ib->h); + + const void *const *dstd = pack ? e->a : e->b; + const void *const *srcd = pack ? e->b : e->a; + struct mp_image *dsti = pack ? ia : ib; + struct mp_image *srci = pack ? 
ib : ia; + + bool r = repack_config_buffers(repacker, 0, dsti, 0, srci, NULL); + assert(r); + + for (int p = 0; p < srci->num_planes; p++) { + uint8_t *ptr = mp_image_pixel_ptr(srci, p, sx, sy); + for (int y = 0; y < e->h >> srci->fmt.ys[p]; y++) { + int w = e->w >> srci->fmt.xs[p]; + int wb = (w * srci->fmt.bpp[p] + 7) / 8; + const void *cptr = (uint8_t *)srcd[p] + wb * y; + memcpy(ptr + srci->stride[p] * y, cptr, wb); + } + } + + repack_line(repacker, dx, dy, sx, sy, e->w); + + for (int p = 0; p < dsti->num_planes; p++) { + uint8_t *ptr = mp_image_pixel_ptr(dsti, p, dx, dy); + for (int y = 0; y < e->h >> dsti->fmt.ys[p]; y++) { + int w = e->w >> dsti->fmt.xs[p]; + int wb = (w * dsti->fmt.bpp[p] + 7) / 8; + const void *cptr = (uint8_t *)dstd[p] + wb * y; + assert_memcmp(ptr + dsti->stride[p] * y, cptr, wb); + } + } + + fprintf(f, " [t%s]", pack ? "p" : "u"); + } + + talloc_free(ia); + talloc_free(ib); + } + + fprintf(f, "\n"); + + talloc_free(pa); + talloc_free(un); + return b; +} + +static void run(struct test_ctx *ctx) +{ + FILE *f = test_open_out(ctx, "repack.txt"); + + init_imgfmts_list(); + for (int n = 0; n < num_imgfmts; n++) { + int imgfmt = imgfmts[n]; + + int other = try_repack(ctx, f, imgfmt, 0, 0); + try_repack(ctx, f, imgfmt, REPACK_CREATE_ROUND_DOWN, other); + try_repack(ctx, f, imgfmt, REPACK_CREATE_EXPAND_8BIT, other); + } + + fclose(f); + + assert_text_files_equal(ctx, "repack.txt", "repack.txt", + "This can fail if FFmpeg/libswscale adds or removes pixfmts."); +} + +const struct unittest test_repack = { + .name = "repack", + .run = run, +}; diff --git a/test/tests.c b/test/tests.c index 9ef88f4a8d..d8df43f319 100644 --- a/test/tests.c +++ b/test/tests.c @@ -12,6 +12,7 @@ static const struct unittest *unittests[] = { &test_paths, &test_repack_sws, #if HAVE_ZIMG + &test_repack, // zimg only due to cross-checking with zimg.c &test_repack_zimg, #endif NULL @@ -128,3 +129,25 @@ void assert_text_files_equal_impl(const char *file, int line, abort(); } } 
+ +static void hexdump(const uint8_t *d, size_t size) +{ + printf("|"); + while (size--) { + printf(" %02x", d[0]); + d++; + } + printf(" |\n"); +} + +void assert_memcmp_impl(const char *file, int line, + const void *a, const void *b, size_t size) +{ + if (memcmp(a, b, size) == 0) + return; + + printf("%s:%d: mismatching data:\n", file, line); + hexdump(a, size); + hexdump(b, size); + abort(); +} diff --git a/test/tests.h b/test/tests.h index f4065f596f..8b2eb98174 100644 --- a/test/tests.h +++ b/test/tests.h @@ -43,6 +43,7 @@ extern const struct unittest test_json; extern const struct unittest test_linked_list; extern const struct unittest test_repack_sws; extern const struct unittest test_repack_zimg; +extern const struct unittest test_repack; extern const struct unittest test_paths; #define assert_true(x) assert(x) @@ -54,6 +55,10 @@ extern const struct unittest test_paths; #define assert_float_equal(a, b, tolerance) \ assert_float_equal_impl(__FILE__, __LINE__, (a), (b), (tolerance)) +// Assert that memcmp(a,b,s)==0, or hexdump output on failure. +#define assert_memcmp(a, b, s) \ + assert_memcmp_impl(__FILE__, __LINE__, (a), (b), (s)) + // Require that the files "ref" and "new" are the same. The paths can be // relative to ref_path and out_path respectively. If they're not the same, // the output of "diff" is shown, the err message (if not NULL), and the test @@ -69,6 +74,8 @@ void assert_float_equal_impl(const char *file, int line, void assert_text_files_equal_impl(const char *file, int line, struct test_ctx *ctx, const char *ref, const char *new, const char *err); +void assert_memcmp_impl(const char *file, int line, + const void *a, const void *b, size_t size); // Open a new file in the out_path. Always succeeds. 
FILE *test_open_out(struct test_ctx *ctx, const char *name); diff --git a/video/img_format.h b/video/img_format.h index 8e55cc9493..b0fdef8a50 100644 --- a/video/img_format.h +++ b/video/img_format.h @@ -69,8 +69,9 @@ struct mp_imgfmt_desc { int flags; // MP_IMGFLAG_* bitfield int8_t num_planes; int8_t chroma_xs, chroma_ys; // chroma shift (i.e. log2 of chroma pixel size) - int8_t align_x, align_y; // pixel size to get byte alignment and to get + int8_t align_x, align_y; // pixel count to get byte alignment and to get // to a pixel pos where luma & chroma aligns + // always power of 2 int8_t bytes[MP_MAX_PLANES]; // bytes per pixel (MP_IMGFLAG_BYTE_ALIGNED) int8_t bpp[MP_MAX_PLANES]; // bits per pixel int8_t plane_bits; // number of bits in use for plane 0 diff --git a/video/repack.c b/video/repack.c new file mode 100644 index 0000000000..359e32996d --- /dev/null +++ b/video/repack.c @@ -0,0 +1,1110 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include + +#include +#include + +#include "common/common.h" +#include "repack.h" +#include "video/fmt-conversion.h" +#include "video/img_format.h" +#include "video/mp_image.h" + +enum repack_step_type { + REPACK_STEP_REPACK, + REPACK_STEP_ENDIAN, +}; + +struct repack_step { + enum repack_step_type type; + // 0=input, 1=output + struct mp_image *buf[2]; + bool user_buf[2]; // user_buf[n]==true if buf[n] = user src/dst buffer + struct mp_imgfmt_desc fmt[2]; + struct mp_image *tmp; // output buffer, if needed +}; + +struct mp_repack { + bool pack; // if false, this is for unpacking + int flags; + int imgfmt_user; // original mp format (unchanged endian) + int imgfmt_a; // original mp format (possibly packed format, + // swapped endian) + int imgfmt_b; // equivalent unpacked/planar format + struct mp_imgfmt_desc fmt_a;// ==imgfmt_a + struct mp_imgfmt_desc fmt_b;// ==imgfmt_b + + void (*repack)(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w); + + bool passthrough_y; // possible luma plane optimization for e.g. nv12 + int endian_size; // endian swap; 0=none, 2/4=swap word size + + // For packed_repack. + int components[4]; // b[n] = mp_image.planes[components[n]] + // pack: a is dst, b is src + // unpack: a is src, b is dst + void (*packed_repack_scanline)(void *a, void *b[], int w); + + // Fringe RGB/YUV. + uint8_t comp_size; + uint8_t *comp_map; + uint8_t comp_shifts[3]; + uint8_t *comp_lut; + + // REPACK_STEP_REPACK: if true, need to copy this plane + bool copy_buf[4]; + + struct repack_step steps[4]; + int num_steps; + + bool configured; +}; + +// depth = number of LSB in use +static int find_gbrp_format(int depth, int num_planes) +{ + if (num_planes != 3 && num_planes != 4) + return 0; + struct mp_regular_imgfmt desc = { + .component_type = MP_COMPONENT_TYPE_UINT, + .forced_csp = MP_CSP_RGB, + .component_size = depth > 8 ? 2 : 1, + .component_pad = depth - (depth > 8 ? 
16 : 8), + .num_planes = num_planes, + .planes = { {1, {2}}, {1, {3}}, {1, {1}}, {1, {4}} }, + }; + return mp_find_regular_imgfmt(&desc); +} + +// depth = number of LSB in use +static int find_yuv_format(int depth, int num_planes) +{ + if (num_planes < 1 || num_planes > 4) + return 0; + struct mp_regular_imgfmt desc = { + .component_type = MP_COMPONENT_TYPE_UINT, + .component_size = depth > 8 ? 2 : 1, + .component_pad = depth - (depth > 8 ? 16 : 8), + .num_planes = num_planes, + .planes = { {1, {1}}, {1, {2}}, {1, {3}}, {1, {4}} }, + }; + if (num_planes == 2) + desc.planes[1].components[0] = 4; + return mp_find_regular_imgfmt(&desc); +} + +// Copy one line on the plane p. +static void copy_plane(struct mp_image *dst, int dst_x, int dst_y, + struct mp_image *src, int src_x, int src_y, + int w, int p) +{ + // Number of lines on this plane. + int h = (1 << dst->fmt.chroma_ys) - (1 << dst->fmt.ys[p]) + 1; + size_t size = mp_image_plane_bytes(dst, p, dst_x, w); + + assert(dst->fmt.bpp[p] == src->fmt.bpp[p]); + + for (int y = 0; y < h; y++) { + void *pd = mp_image_pixel_ptr(dst, p, dst_x, dst_y + y); + void *ps = mp_image_pixel_ptr(src, p, src_x, src_y + y); + memcpy(pd, ps, size); + } +} + +// Swap endian for one line. +static void swap_endian(struct mp_image *dst, int dst_x, int dst_y, + struct mp_image *src, int src_x, int src_y, + int w, int endian_size) +{ + assert(src->fmt.num_planes == dst->fmt.num_planes); + + for (int p = 0; p < dst->fmt.num_planes; p++) { + int xs = dst->fmt.xs[p]; + int bpp = dst->fmt.bytes[p]; + int words_per_pixel = bpp / endian_size; + int num_words = ((w + (1 << xs) - 1) >> xs) * words_per_pixel; + // Number of lines on this plane. 
+ int h = (1 << dst->fmt.chroma_ys) - (1 << dst->fmt.ys[p]) + 1; + + assert(src->fmt.bytes[p] == bpp); + + for (int y = 0; y < h; y++) { + void *s = mp_image_pixel_ptr(src, p, src_x, src_y + y); + void *d = mp_image_pixel_ptr(dst, p, dst_x, dst_y + y); + switch (endian_size) { + case 2: + for (int x = 0; x < num_words; x++) + ((uint16_t *)d)[x] = av_bswap16(((uint16_t *)s)[x]); + break; + case 4: + for (int x = 0; x < num_words; x++) + ((uint32_t *)d)[x] = av_bswap32(((uint32_t *)s)[x]); + break; + default: + assert(0); + } + } + } +} + +// PA = PAck, copy planar input to single packed array +// UN = UNpack, copy packed input to planar output +// Naming convention: +// pa_/un_ prefix to identify conversion direction. +// Left (LSB, lowest byte address) -> Right (MSB, highest byte address). +// (This is unusual; MSB to LSB is more commonly used to describe formats, +// but our convention makes more sense for byte access in little endian.) +// "c" identifies a color component. +// "z" identifies known zero padding. +// "x" identifies uninitialized padding. +// A component is followed by its size in bits. +// Size can be omitted for multiple uniform components (c8c8c8 == ccc8). +// Unpackers will often use "x" for padding, because they ignore it, while +// packers will use "z" because they write zero. 
+ +#define PA_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3) \ + static void name(void *dst, void *src[], int w) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = \ + ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ + ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ + ((packed_t)((plane_t *)src[2])[x] << (sh_c2)) | \ + ((packed_t)((plane_t *)src[3])[x] << (sh_c3)); \ + } \ + } + +#define UN_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3, mask)\ + static void name(void *src, void *dst[], int w) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ + ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ + ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ + ((plane_t *)dst[3])[x] = (c >> (sh_c3)) & (mask); \ + } \ + } + + +#define PA_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, pad) \ + static void name(void *dst, void *src[], int w) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = (pad) | \ + ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ + ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ + ((packed_t)((plane_t *)src[2])[x] << (sh_c2)); \ + } \ + } + +UN_WORD_4(un_cccc8, uint32_t, uint8_t, 0, 8, 16, 24, 0xFFu) +PA_WORD_4(pa_cccc8, uint32_t, uint8_t, 0, 8, 16, 24) +// Not sure if this is a good idea; there may be no alignment guarantee. 
+UN_WORD_4(un_cccc16, uint64_t, uint16_t, 0, 16, 32, 48, 0xFFFFu) +PA_WORD_4(pa_cccc16, uint64_t, uint16_t, 0, 16, 32, 48) + +#define UN_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, mask) \ + static void name(void *src, void *dst[], int w) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ + ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ + ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ + } \ + } + +UN_WORD_3(un_ccc8x8, uint32_t, uint8_t, 0, 8, 16, 0xFFu) +PA_WORD_3(pa_ccc8z8, uint32_t, uint8_t, 0, 8, 16, 0) +UN_WORD_3(un_x8ccc8, uint32_t, uint8_t, 8, 16, 24, 0xFFu) +PA_WORD_3(pa_z8ccc8, uint32_t, uint8_t, 8, 16, 24, 0) +UN_WORD_3(un_ccc10x2, uint32_t, uint16_t, 0, 10, 20, 0x3FFu) +PA_WORD_3(pa_ccc10z2, uint32_t, uint16_t, 20, 10, 0, 0) + +#define PA_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, pad) \ + static void name(void *dst, void *src[], int w) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = (pad) | \ + ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ + ((packed_t)((plane_t *)src[1])[x] << (sh_c1)); \ + } \ + } + +#define UN_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, mask) \ + static void name(void *src, void *dst[], int w) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ + ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ + } \ + } + +UN_WORD_2(un_cc8, uint16_t, uint8_t, 0, 8, 0xFFu) +PA_WORD_2(pa_cc8, uint16_t, uint8_t, 0, 8, 0) +UN_WORD_2(un_cc16, uint32_t, uint16_t, 0, 16, 0xFFFFu) +PA_WORD_2(pa_cc16, uint32_t, uint16_t, 0, 16, 0) + +#define PA_SEQ_3(name, comp_t) \ + static void name(void *dst, void *src[], int w) { \ + comp_t *r = dst; \ + for (int x = 0; x < w; x++) { \ + *r++ = ((comp_t *)src[0])[x]; \ + *r++ = ((comp_t *)src[1])[x]; \ + *r++ = ((comp_t *)src[2])[x]; \ + } \ + } + +#define UN_SEQ_3(name, comp_t) \ + static void name(void *src, void 
*dst[], int w) { \ + comp_t *r = src; \ + for (int x = 0; x < w; x++) { \ + ((comp_t *)dst[0])[x] = *r++; \ + ((comp_t *)dst[1])[x] = *r++; \ + ((comp_t *)dst[2])[x] = *r++; \ + } \ + } + +UN_SEQ_3(un_ccc8, uint8_t) +PA_SEQ_3(pa_ccc8, uint8_t) +UN_SEQ_3(un_ccc16, uint16_t) +PA_SEQ_3(pa_ccc16, uint16_t) + +// "regular": single packed plane, all components have same width (except padding) +struct regular_repacker { + int packed_width; // number of bits of the packed pixel + int component_width; // number of bits for a single component + int prepadding; // number of bits of LSB padding + int num_components; // number of components that can be accessed + void (*pa_scanline)(void *a, void *b[], int w); + void (*un_scanline)(void *a, void *b[], int w); +}; + +static const struct regular_repacker regular_repackers[] = { + {32, 8, 0, 3, pa_ccc8z8, un_ccc8x8}, + {32, 8, 8, 3, pa_z8ccc8, un_x8ccc8}, + {32, 8, 0, 4, pa_cccc8, un_cccc8}, + {64, 16, 0, 4, pa_cccc16, un_cccc16}, + {24, 8, 0, 3, pa_ccc8, un_ccc8}, + {48, 16, 0, 3, pa_ccc16, un_ccc16}, + {16, 8, 0, 2, pa_cc8, un_cc8}, + {32, 16, 0, 2, pa_cc16, un_cc16}, + {32, 10, 0, 3, pa_ccc10z2, un_ccc10x2}, +}; + +static void packed_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + uint32_t *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + + void *pb[4] = {0}; + for (int p = 0; p < b->num_planes; p++) { + int s = rp->components[p]; + pb[p] = mp_image_pixel_ptr(b, s, b_x, b_y); + } + + rp->packed_repack_scanline(pa, pb, w); +} + +// Tries to set a packer/unpacker for component-wise byte aligned formats. 
+static void setup_packed_packer(struct mp_repack *rp) +{ + struct mp_regular_imgfmt desc; + if (!mp_get_regular_imgfmt(&desc, rp->imgfmt_a)) + return; + + if (desc.num_planes != 1 || desc.planes[0].num_components < 2) + return; + struct mp_regular_imgfmt_plane *p = &desc.planes[0]; + + int num_real_components = 0; + bool has_alpha = false; + for (int n = 0; n < p->num_components; n++) { + if (p->components[n]) { + has_alpha |= p->components[n] == 4; + num_real_components += 1; + } else { + // padding must be in MSB or LSB + if (n != 0 && n != p->num_components - 1) + return; + } + } + + int depth = desc.component_size * 8 + MPMIN(0, desc.component_pad); + + static const int reorder_gbrp[] = {0, 3, 1, 2, 4}; + static const int reorder_yuv[] = {0, 1, 2, 3, 4}; + int planar_fmt = 0; + const int *reorder = NULL; + if (desc.forced_csp) { + if (desc.forced_csp != MP_CSP_RGB && desc.forced_csp != MP_CSP_XYZ) + return; + planar_fmt = find_gbrp_format(depth, num_real_components); + reorder = reorder_gbrp; + } else { + planar_fmt = find_yuv_format(depth, num_real_components); + reorder = reorder_yuv; + } + if (!planar_fmt) + return; + + for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) { + const struct regular_repacker *pa = ®ular_repackers[i]; + + // The following may assume little endian (because some repack backends + // use word access, while the metadata here uses byte access). + + int prepad = p->components[0] ? 0 : 8; + int first_comp = p->components[0] ? 0 : 1; + void (*repack_cb)(void *pa, void *pb[], int w) = + rp->pack ? 
pa->pa_scanline : pa->un_scanline; + + if (pa->packed_width != desc.component_size * p->num_components * 8 || + pa->component_width != depth || + pa->num_components != num_real_components || + pa->prepadding != prepad || + !repack_cb) + continue; + + rp->repack = packed_repack; + rp->packed_repack_scanline = repack_cb; + rp->imgfmt_b = planar_fmt; + for (int n = 0; n < num_real_components; n++) { + // Determine permutation that maps component order between the two + // formats, with has_alpha special case (see above). + int c = reorder[p->components[first_comp + n]]; + rp->components[n] = c == 4 ? num_real_components - 1 : c - 1; + } + return; + } +} + +struct fringe_rgb_repacker { + // To avoid making a mess of IMGFMT_*, we use av formats directly. + enum AVPixelFormat avfmt; + // If true, use BGR instead of RGB. + // False: LSB - R - G - B - pad - MSB + // True: LSB - B - G - R - pad - MSB + bool rev_order; + // Size in bit for each component, strictly from LSB to MSB. + int bits[3]; + bool be; +}; + +static const struct fringe_rgb_repacker fringe_rgb_repackers[] = { + {AV_PIX_FMT_BGR4_BYTE, false, {1, 2, 1}}, + {AV_PIX_FMT_RGB4_BYTE, true, {1, 2, 1}}, + {AV_PIX_FMT_BGR8, false, {3, 3, 2}}, + {AV_PIX_FMT_RGB8, true, {2, 3, 3}}, // pixdesc desc. and doc. bug? 
+ {AV_PIX_FMT_RGB444LE, true, {4, 4, 4}}, + {AV_PIX_FMT_RGB444BE, true, {4, 4, 4}, .be = true}, + {AV_PIX_FMT_BGR444LE, false, {4, 4, 4}}, + {AV_PIX_FMT_BGR444BE, false, {4, 4, 4}, .be = true}, + {AV_PIX_FMT_BGR565LE, false, {5, 6, 5}}, + {AV_PIX_FMT_BGR565BE, false, {5, 6, 5}, .be = true}, + {AV_PIX_FMT_RGB565LE, true, {5, 6, 5}}, + {AV_PIX_FMT_RGB565BE, true, {5, 6, 5}, .be = true}, + {AV_PIX_FMT_BGR555LE, false, {5, 5, 5}}, + {AV_PIX_FMT_BGR555BE, false, {5, 5, 5}, .be = true}, + {AV_PIX_FMT_RGB555LE, true, {5, 5, 5}}, + {AV_PIX_FMT_RGB555BE, true, {5, 5, 5}, .be = true}, +}; + +#define PA_SHIFT_LUT8(name, packed_t) \ + static void name(void *dst, void *src[], int w, uint8_t *lut, \ + uint8_t s0, uint8_t s1, uint8_t s2) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = \ + (lut[((uint8_t *)src[0])[x] + 256 * 0] << s0) | \ + (lut[((uint8_t *)src[1])[x] + 256 * 1] << s1) | \ + (lut[((uint8_t *)src[2])[x] + 256 * 2] << s2); \ + } \ + } + + +#define UN_SHIFT_LUT8(name, packed_t) \ + static void name(void *src, void *dst[], int w, uint8_t *lut, \ + uint8_t s0, uint8_t s1, uint8_t s2) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((uint8_t *)dst[0])[x] = lut[((c >> s0) & 0xFF) + 256 * 0]; \ + ((uint8_t *)dst[1])[x] = lut[((c >> s1) & 0xFF) + 256 * 1]; \ + ((uint8_t *)dst[2])[x] = lut[((c >> s2) & 0xFF) + 256 * 2]; \ + } \ + } + +PA_SHIFT_LUT8(pa_shift_lut8_8, uint8_t) +PA_SHIFT_LUT8(pa_shift_lut8_16, uint16_t) +UN_SHIFT_LUT8(un_shift_lut8_8, uint8_t) +UN_SHIFT_LUT8(un_shift_lut8_16, uint16_t) + +static void fringe_rgb_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + void *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + + void *pb[4] = {0}; + for (int p = 0; p < b->num_planes; p++) { + int s = rp->components[p]; + pb[p] = mp_image_pixel_ptr(b, s, b_x, b_y); + } + + assert(rp->comp_size == 1 || rp->comp_size == 2); + + void (*repack)(void *pa, void *pb[], int 
w, uint8_t *lut, + uint8_t s0, uint8_t s1, uint8_t s2) = NULL; + if (rp->pack) { + repack = rp->comp_size == 1 ? pa_shift_lut8_8 : pa_shift_lut8_16; + } else { + repack = rp->comp_size == 1 ? un_shift_lut8_8 : un_shift_lut8_16; + } + repack(pa, pb, w, rp->comp_lut, + rp->comp_shifts[0], rp->comp_shifts[1], rp->comp_shifts[2]); +} + +static void setup_fringe_rgb_packer(struct mp_repack *rp) +{ + enum AVPixelFormat avfmt = imgfmt2pixfmt(rp->imgfmt_a); + + const struct fringe_rgb_repacker *fmt = NULL; + for (int n = 0; n < MP_ARRAY_SIZE(fringe_rgb_repackers); n++) { + if (fringe_rgb_repackers[n].avfmt == avfmt) { + fmt = &fringe_rgb_repackers[n]; + break; + } + } + + if (!fmt) + return; + + int depth = fmt->bits[0]; + for (int n = 0; n < 3; n++) { + if (rp->flags & REPACK_CREATE_ROUND_DOWN) { + depth = MPMIN(depth, fmt->bits[n]); + } else { + depth = MPMAX(depth, fmt->bits[n]); + } + } + if (rp->flags & REPACK_CREATE_EXPAND_8BIT) + depth = 8; + + rp->imgfmt_b = find_gbrp_format(depth, 3); + if (!rp->imgfmt_b) + return; + rp->comp_lut = talloc_array(rp, uint8_t, 256 * 3); + rp->repack = fringe_rgb_repack; + static const int c_order_rgb[] = {3, 1, 2}; + static const int c_order_bgr[] = {2, 1, 3}; + for (int n = 0; n < 3; n++) + rp->components[n] = (fmt->rev_order ? 
c_order_bgr : c_order_rgb)[n] - 1; + + int bitpos = 0; + for (int n = 0; n < 3; n++) { + int bits = fmt->bits[n]; + rp->comp_shifts[n] = bitpos; + if (rp->comp_lut) { + uint8_t *lut = rp->comp_lut + 256 * n; + uint8_t zmax = (1 << depth) - 1; + uint8_t cmax = (1 << bits) - 1; + for (int v = 0; v < 256; v++) { + if (rp->pack) { + lut[v] = (v * cmax + zmax / 2) / zmax; + } else { + lut[v] = (v & cmax) * zmax / cmax; + } + } + } + bitpos += bits; + } + + rp->comp_size = (bitpos + 7) / 8; + assert(rp->comp_size == 1 || rp->comp_size == 2); + + if (fmt->be) { + assert(rp->comp_size == 2); + rp->endian_size = 2; + } +} + +static void unpack_pal(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + uint8_t *src = mp_image_pixel_ptr(a, 0, a_x, a_y); + uint32_t *pal = (void *)a->planes[1]; + + uint8_t *dst[4] = {0}; + for (int p = 0; p < b->num_planes; p++) + dst[p] = mp_image_pixel_ptr(b, p, b_x, b_y); + + for (int x = 0; x < w; x++) { + uint32_t c = pal[src[x]]; + dst[0][x] = (c >> 8) & 0xFF; // G + dst[1][x] = (c >> 0) & 0xFF; // B + dst[2][x] = (c >> 16) & 0xFF; // R + dst[3][x] = (c >> 24) & 0xFF; // A + } +} + +static void bitmap_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + uint8_t *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + uint8_t *pb = mp_image_pixel_ptr(b, 0, b_x, b_y); + + if (rp->pack) { + for (unsigned x = 0; x < w; x += 8) { + uint8_t d = 0; + int max_b = MPMIN(8, w - x); + for (int bp = 0; bp < max_b; bp++) + d |= (rp->comp_lut[pb[x + bp]]) << (7 - bp); + pa[x / 8] = d; + } + } else { + for (unsigned x = 0; x < w; x += 8) { + uint8_t d = pa[x / 8]; + int max_b = MPMIN(8, w - x); + for (int bp = 0; bp < max_b; bp++) + pb[x + bp] = rp->comp_lut[d & (1 << (7 - bp))]; + } + } +} + +static void setup_misc_packer(struct mp_repack *rp) +{ + // Although it's in regular_repackers[], the generic mpv imgfmt metadata + // can't handle it 
yet. + if (rp->imgfmt_a == IMGFMT_RGB30) { + int planar_fmt = find_gbrp_format(10, 3); + if (!planar_fmt) + return; + rp->imgfmt_b = planar_fmt; + rp->repack = packed_repack; + rp->packed_repack_scanline = rp->pack ? pa_ccc10z2 : un_ccc10x2; + static int c_order[] = {3, 2, 1}; + for (int n = 0; n < 3; n++) + rp->components[n] = c_order[n] - 1; + } else if (rp->imgfmt_a == IMGFMT_PAL8 && !rp->pack) { + int grap_fmt = find_gbrp_format(8, 4); + if (!grap_fmt) + return; + rp->imgfmt_b = grap_fmt; + rp->repack = unpack_pal; + } else { + enum AVPixelFormat avfmt = imgfmt2pixfmt(rp->imgfmt_a); + if (avfmt == AV_PIX_FMT_MONOWHITE || avfmt == AV_PIX_FMT_MONOBLACK) { + rp->comp_lut = talloc_array(rp, uint8_t, 256); + rp->imgfmt_b = IMGFMT_Y1; + int max = 1; + if (rp->flags & REPACK_CREATE_EXPAND_8BIT) { + rp->imgfmt_b = IMGFMT_Y8; + max = 255; + } + bool inv = avfmt == AV_PIX_FMT_MONOWHITE; + for (int n = 0; n < 256; n++) { + rp->comp_lut[n] = rp->pack ? (inv ^ (n >= (max + 1) / 2)) + : ((inv ^ !!n) ? max : 0); + } + rp->repack = bitmap_repack; + return; + } + } +} + +struct fringe_yuv422_repacker { + // To avoid making a mess of IMGFMT_*, we use av formats directly. 
+ enum AVPixelFormat avfmt; + // In bits (depth/8 rounded up gives byte size) + int8_t depth; + // Word index of each sample: {y0, y1, cb, cr} + uint8_t comp[4]; + bool be; +}; + +static const struct fringe_yuv422_repacker fringe_yuv422_repackers[] = { + {AV_PIX_FMT_YUYV422, 8, {0, 2, 1, 3}}, + {AV_PIX_FMT_UYVY422, 8, {1, 3, 0, 2}}, + {AV_PIX_FMT_YVYU422, 8, {0, 2, 3, 1}}, +#ifdef AV_PIX_FMT_Y210 + {AV_PIX_FMT_Y210LE, 10, {0, 2, 1, 3}}, + {AV_PIX_FMT_Y210BE, 10, {0, 2, 1, 3}, .be = true}, +#endif +}; + +#define PA_P422(name, comp_t) \ + static void name(void *dst, void *src[], int w, uint8_t *c) { \ + for (int x = 0; x < w; x += 2) { \ + ((comp_t *)dst)[x * 2 + c[0]] = ((comp_t *)src[0])[x + 0]; \ + ((comp_t *)dst)[x * 2 + c[1]] = ((comp_t *)src[0])[x + 1]; \ + ((comp_t *)dst)[x * 2 + c[2]] = ((comp_t *)src[1])[x >> 1]; \ + ((comp_t *)dst)[x * 2 + c[3]] = ((comp_t *)src[2])[x >> 1]; \ + } \ + } + + +#define UN_P422(name, comp_t) \ + static void name(void *src, void *dst[], int w, uint8_t *c) { \ + for (int x = 0; x < w; x += 2) { \ + ((comp_t *)dst[0])[x + 0] = ((comp_t *)src)[x * 2 + c[0]]; \ + ((comp_t *)dst[0])[x + 1] = ((comp_t *)src)[x * 2 + c[1]]; \ + ((comp_t *)dst[1])[x >> 1] = ((comp_t *)src)[x * 2 + c[2]]; \ + ((comp_t *)dst[2])[x >> 1] = ((comp_t *)src)[x * 2 + c[3]]; \ + } \ + } + +PA_P422(pa_p422_8, uint8_t) +PA_P422(pa_p422_16, uint16_t) +UN_P422(un_p422_8, uint8_t) +UN_P422(un_p422_16, uint16_t) + +static void fringe_yuv422_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + void *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + + void *pb[4] = {0}; + for (int p = 0; p < b->num_planes; p++) + pb[p] = mp_image_pixel_ptr(b, p, b_x, b_y); + + assert(rp->comp_size == 1 || rp->comp_size == 2); + + void (*repack)(void *a, void *b[], int w, uint8_t *c) = NULL; + if (rp->pack) { + repack = rp->comp_size == 1 ? pa_p422_8 : pa_p422_16; + } else { + repack = rp->comp_size == 1 ? 
un_p422_8 : un_p422_16; + } + repack(pa, pb, w, rp->comp_map); +} + +static void setup_fringe_yuv422_packer(struct mp_repack *rp) +{ + enum AVPixelFormat avfmt = imgfmt2pixfmt(rp->imgfmt_a); + + const struct fringe_yuv422_repacker *fmt = NULL; + for (int n = 0; n < MP_ARRAY_SIZE(fringe_yuv422_repackers); n++) { + if (fringe_yuv422_repackers[n].avfmt == avfmt) { + fmt = &fringe_yuv422_repackers[n]; + break; + } + } + + if (!fmt) + return; + + rp->comp_size = (fmt->depth + 7) / 8; + assert(rp->comp_size == 1 || rp->comp_size == 2); + + struct mp_regular_imgfmt yuvfmt = { + .component_type = MP_COMPONENT_TYPE_UINT, + // NB: same problem with P010 and not clearing padding. + .component_size = rp->comp_size, + .num_planes = 3, + .planes = { {1, {1}}, {1, {2}}, {1, {3}} }, + .chroma_xs = 1, + .chroma_ys = 0, + }; + rp->imgfmt_b = mp_find_regular_imgfmt(&yuvfmt); + rp->repack = fringe_yuv422_repack; + rp->comp_map = (uint8_t *)fmt->comp; + + if (fmt->be) { + assert(rp->comp_size == 2); + rp->endian_size = 2; + } +} + +static void repack_nv(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + int xs = a->fmt.chroma_xs; + + uint32_t *pa = mp_image_pixel_ptr(a, 1, a_x, a_y); + + void *pb[2]; + for (int p = 0; p < 2; p++) { + int s = rp->components[p]; + pb[p] = mp_image_pixel_ptr(b, s, b_x, b_y); + } + + rp->packed_repack_scanline(pa, pb, (w + (1 << xs) - 1) >> xs); +} + +static void setup_nv_packer(struct mp_repack *rp) +{ + struct mp_regular_imgfmt desc; + if (!mp_get_regular_imgfmt(&desc, rp->imgfmt_a)) + return; + + // Check for NV. + if (desc.num_planes != 2) + return; + if (desc.planes[0].num_components != 1 || desc.planes[0].components[0] != 1) + return; + if (desc.planes[1].num_components != 2) + return; + int cr0 = desc.planes[1].components[0]; + int cr1 = desc.planes[1].components[1]; + if (cr0 > cr1) + MPSWAP(int, cr0, cr1); + if (cr0 != 2 || cr1 != 3) + return; + + // Construct equivalent planar format. 
+ struct mp_regular_imgfmt desc2 = desc; + desc2.num_planes = 3; + desc2.planes[1].num_components = 1; + desc2.planes[1].components[0] = 2; + desc2.planes[2].num_components = 1; + desc2.planes[2].components[0] = 3; + // For P010. Strangely this concept exists only for the NV format. + if (desc2.component_pad > 0) + desc2.component_pad = 0; + + int planar_fmt = mp_find_regular_imgfmt(&desc2); + if (!planar_fmt) + return; + + for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) { + const struct regular_repacker *pa = &regular_repackers[i]; + + void (*repack_cb)(void *pa, void *pb[], int w) = + rp->pack ? pa->pa_scanline : pa->un_scanline; + + if (pa->packed_width != desc.component_size * 2 * 8 || + pa->component_width != desc.component_size * 8 || + pa->num_components != 2 || + pa->prepadding != 0 || + !repack_cb) + continue; + + rp->repack = repack_nv; + rp->passthrough_y = true; + rp->packed_repack_scanline = repack_cb; + rp->imgfmt_b = planar_fmt; + rp->components[0] = desc.planes[1].components[0] - 1; + rp->components[1] = desc.planes[1].components[1] - 1; + return; + } +} + +void repack_line(struct mp_repack *rp, int dst_x, int dst_y, + int src_x, int src_y, int w) +{ + assert(rp->configured); + + struct repack_step *first = &rp->steps[0]; + struct repack_step *last = &rp->steps[rp->num_steps - 1]; + + assert(dst_x >= 0 && dst_y >= 0 && src_x >= 0 && src_y >= 0 && w >= 0); + assert(dst_x + w <= MP_ALIGN_UP(last->buf[1]->w, last->fmt[1].align_x)); + assert(src_x + w <= MP_ALIGN_UP(first->buf[1]->w, first->fmt[0].align_x)); + assert(dst_y < last->buf[1]->h); + assert(src_y < first->buf[0]->h); + assert(!(dst_x & (last->fmt[1].align_x - 1))); + assert(!(src_x & (first->fmt[0].align_x - 1))); + assert(!(w & ((1 << first->fmt[0].chroma_xs) - 1))); + assert(!(dst_y & (last->fmt[1].align_y - 1))); + assert(!(src_y & (first->fmt[0].align_y - 1))); + + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + + // When writing to temporary 
buffers, always write to the start (maybe + // helps with locality). + int sx = rs->user_buf[0] ? src_x : 0; + int sy = rs->user_buf[0] ? src_y : 0; + int dx = rs->user_buf[1] ? dst_x : 0; + int dy = rs->user_buf[1] ? dst_y : 0; + + struct mp_image *buf_a = rs->buf[rp->pack]; + struct mp_image *buf_b = rs->buf[!rp->pack]; + int a_x = rp->pack ? dx : sx; + int a_y = rp->pack ? dy : sy; + int b_x = rp->pack ? sx : dx; + int b_y = rp->pack ? sy : dy; + + switch (rs->type) { + case REPACK_STEP_REPACK: { + if (rp->repack) + rp->repack(rp, buf_a, a_x, a_y, buf_b, b_x, b_y, w); + + for (int p = 0; p < rs->fmt[0].num_planes; p++) { + if (rp->copy_buf[p]) + copy_plane(rs->buf[1], dx, dy, rs->buf[0], sx, sy, w, p); + } + break; + } + case REPACK_STEP_ENDIAN: + swap_endian(rs->buf[1], dx, dy, rs->buf[0], sx, sy, w, + rp->endian_size); + break; + } + } +} + +static bool setup_format_ne(struct mp_repack *rp) +{ + if (!rp->imgfmt_b) + setup_nv_packer(rp); + if (!rp->imgfmt_b) + setup_misc_packer(rp); + if (!rp->imgfmt_b) + setup_packed_packer(rp); + if (!rp->imgfmt_b) + setup_fringe_rgb_packer(rp); + if (!rp->imgfmt_b) + setup_fringe_yuv422_packer(rp); + if (!rp->imgfmt_b) + rp->imgfmt_b = rp->imgfmt_a; // maybe it was planar after all + + struct mp_regular_imgfmt desc; + if (!mp_get_regular_imgfmt(&desc, rp->imgfmt_b)) + return false; + + // no weird stuff + if (desc.num_planes > 4) + return false; + + // Endian swapping. + if (rp->imgfmt_a != rp->imgfmt_user) { + struct mp_regular_imgfmt ndesc; + if (!mp_get_regular_imgfmt(&ndesc, rp->imgfmt_a) || ndesc.num_planes > 4) + return false; + rp->endian_size = ndesc.component_size; + if (rp->endian_size != 2 && rp->endian_size != 4) + return false; + } + + // Accept only true planar formats (with known components and no padding). 
+ for (int n = 0; n < desc.num_planes; n++) { + if (desc.planes[n].num_components != 1) + return false; + int c = desc.planes[n].components[0]; + if (c < 1 || c > 4) + return false; + } + + rp->fmt_a = mp_imgfmt_get_desc(rp->imgfmt_a); + rp->fmt_b = mp_imgfmt_get_desc(rp->imgfmt_b); + + // This is if we did a pack step. + + rp->steps[rp->num_steps++] = (struct repack_step) { + .type = REPACK_STEP_REPACK, + .fmt = { rp->fmt_b, rp->fmt_a }, + }; + + if (rp->endian_size) { + rp->steps[rp->num_steps++] = (struct repack_step) { + .type = REPACK_STEP_ENDIAN, + .fmt = { + rp->fmt_a, + mp_imgfmt_get_desc(rp->imgfmt_user), + }, + }; + } + + // Reverse if unpack (to reflect actual data flow) + if (!rp->pack) { + for (int n = 0; n < rp->num_steps / 2; n++) { + MPSWAP(struct repack_step, rp->steps[n], + rp->steps[rp->num_steps - 1 - n]); + } + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + MPSWAP(struct mp_imgfmt_desc, rs->fmt[0], rs->fmt[1]); + } + } + + for (int n = 0; n < rp->num_steps - 1; n++) + assert(rp->steps[n].fmt[1].id == rp->steps[n + 1].fmt[0].id); + + return true; +} + +static void reset_params(struct mp_repack *rp) +{ + rp->num_steps = 0; + rp->imgfmt_b = 0; + rp->repack = NULL; + rp->passthrough_y = false; + rp->endian_size = 0; + rp->packed_repack_scanline = NULL; + rp->comp_size = 0; + rp->comp_map = NULL; + talloc_free(rp->comp_lut); + rp->comp_lut = NULL; +} + +static bool setup_format(struct mp_repack *rp) +{ + reset_params(rp); + rp->imgfmt_a = rp->imgfmt_user; + if (setup_format_ne(rp)) + return true; + // Try reverse endian. 
+ reset_params(rp); + rp->imgfmt_a = mp_find_other_endian(rp->imgfmt_user); + return rp->imgfmt_a && setup_format_ne(rp); +} + +struct mp_repack *mp_repack_create_planar(int imgfmt, bool pack, int flags) +{ + struct mp_repack *rp = talloc_zero(NULL, struct mp_repack); + rp->imgfmt_user = imgfmt; + rp->pack = pack; + rp->flags = flags; + + if (!setup_format(rp)) { + talloc_free(rp); + return NULL; + } + + return rp; +} + +int mp_repack_get_format_src(struct mp_repack *rp) +{ + return rp->steps[0].fmt[0].id; +} + +int mp_repack_get_format_dst(struct mp_repack *rp) +{ + return rp->steps[rp->num_steps - 1].fmt[1].id; +} + +int mp_repack_get_align_x(struct mp_repack *rp) +{ + // We really want the LCM between those, but since only one of them is + // packed (or they're the same format), and the chroma subsampling is the + // same for both, only the packed one matters. + return rp->fmt_a.align_x; +} + +int mp_repack_get_align_y(struct mp_repack *rp) +{ + return rp->fmt_a.align_y; // should be the same for packed/planar formats +} + +static void image_realloc(struct mp_image **img, int fmt, int w, int h) +{ + if (*img && (*img)->imgfmt == fmt && (*img)->w == w && (*img)->h == h) + return; + talloc_free(*img); + *img = mp_image_alloc(fmt, w, h); +} + +bool repack_config_buffers(struct mp_repack *rp, + int dst_flags, struct mp_image *dst, + int src_flags, struct mp_image *src, + bool *enable_passthrough) +{ + struct repack_step *rs_first = &rp->steps[0]; + struct repack_step *rs_last = &rp->steps[rp->num_steps - 1]; + + rp->configured = false; + + assert(dst && src); + + int buf_w = MPMAX(dst->w, src->w); + + assert(dst->imgfmt == rs_last->fmt[1].id); + assert(src->imgfmt == rs_first->fmt[0].id); + + // Chain/allocate buffers. 
+ + for (int n = 0; n < rp->num_steps; n++) + rp->steps[n].buf[0] = rp->steps[n].buf[1] = NULL; + + rs_first->buf[0] = src; + rs_last->buf[1] = dst; + + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + + if (!rs->buf[0]) { + assert(n > 0); + rs->buf[0] = rp->steps[n - 1].buf[1]; + } + + if (rs->buf[1]) + continue; + + // Note: since repack_line() can have different src/dst offsets, we + // can't do true in-place in general. + bool can_inplace = rs->type == REPACK_STEP_ENDIAN && + rs->buf[0] != src && rs->buf[0] != dst; + if (can_inplace) { + rs->buf[1] = rs->buf[0]; + continue; + } + + if (rs != rs_last) { + struct repack_step *next = &rp->steps[n + 1]; + if (next->buf[0]) { + rs->buf[1] = next->buf[0]; + continue; + } + } + + image_realloc(&rs->tmp, rs->fmt[1].id, buf_w, rs->fmt[1].align_y); + if (!rs->tmp) + return false; + talloc_steal(rp, rs->tmp); + rs->buf[1] = rs->tmp; + } + + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + rs->user_buf[0] = rs->buf[0] == src || rs->buf[0] == dst; + rs->user_buf[1] = rs->buf[1] == src || rs->buf[1] == dst; + } + + // If repacking is the only operation. It's also responsible for simply + // copying src to dst if absolutely no filtering is done. + bool may_passthrough = + rp->num_steps == 1 && rp->steps[0].type == REPACK_STEP_REPACK; + + for (int p = 0; p < rp->fmt_b.num_planes; p++) { + // (All repack callbacks copy, except nv12 does not copy luma.) + bool repack_copies_plane = rp->repack && !(rp->passthrough_y && p == 0); + + bool can_pt = may_passthrough && !repack_copies_plane && + enable_passthrough && enable_passthrough[p]; + + // Copy if needed, unless the repack callback does it anyway. 
+ rp->copy_buf[p] = !repack_copies_plane && !can_pt; + + if (enable_passthrough) + enable_passthrough[p] = can_pt && !rp->copy_buf[p]; + } + + if (enable_passthrough) { + for (int n = rp->fmt_b.num_planes; n < MP_MAX_PLANES; n++) + enable_passthrough[n] = false; + } + + rp->configured = true; + + return true; +} diff --git a/video/repack.h b/video/repack.h new file mode 100644 index 0000000000..fa81ca9df2 --- /dev/null +++ b/video/repack.h @@ -0,0 +1,72 @@ +#pragma once + +#include <stdbool.h> + +enum { + // This controls behavior with different bit widths per component (like + // RGB565). If ROUND_DOWN is specified, the planar format will use the min. + // bit width of all components, otherwise the transformation is lossless. + REPACK_CREATE_ROUND_DOWN = (1 << 0), + + // Expand some (not all) low bit depth fringe formats to 8 bit on unpack. + REPACK_CREATE_EXPAND_8BIT = (1 << 1), +}; + +struct mp_repack; +struct mp_image; + +// Create a repacker between any format (imgfmt parameter) and an equivalent +// planar format (that is native endian). If pack==true, imgfmt is the output, +// otherwise it is the input. The respective other input/output is the planar +// format. The planar format can be queried with mp_repack_get_format_*(). +// Note that some formats may change the "implied" colorspace (for example, +// packed xyz unpacks as rgb). +// If imgfmt is already planar, a passthrough repacker may be created. +// imgfmt: src or dst format (usually packed, non-planar, etc.) +// pack: true if imgfmt is dst, false if imgfmt is src +// flags: any of REPACK_CREATE_* flags +// returns: NULL on failure, otherwise free with talloc_free(). +struct mp_repack *mp_repack_create_planar(int imgfmt, bool pack, int flags); + +// Return input and output formats for which rp was created. +int mp_repack_get_format_src(struct mp_repack *rp); +int mp_repack_get_format_dst(struct mp_repack *rp); + +// Return pixel alignment. 
For x, this is the lowest pixel count at which there is +// a byte boundary and a full chroma pixel (horizontal subsampling) on src/dst. +// For y, this is the pixel height of the vertical subsampling. +// Always returns a power of 2. +int mp_repack_get_align_x(struct mp_repack *rp); +int mp_repack_get_align_y(struct mp_repack *rp); + +// Repack a single line from src to dst, as set in repack_config_buffers(). +// For subsampled chroma formats, this copies as many luma/alpha rows as needed +// for a complete line (e.g. 2 luma lines, 1 chroma line for 4:2:0). +// dst_x, src_x, y must be aligned to the pixel alignment. w may be unaligned +// if at the right crop-border of the image, but must be always aligned to +// horiz. sub-sampling. y is subject to hslice. +void repack_line(struct mp_repack *rp, int dst_x, int dst_y, +                 int src_x, int src_y, int w); + +// Configure with a source and target buffer. The rp instance will keep the +// mp_image pointers and access them on repack_line() calls. Refcounting is +// not respected - the caller needs to make sure dst is always writable. +// The images can have different sizes (as repack_line() lets you use different +// target coordinates for dst/src). +// This also allocates potentially required temporary buffers. +// dst_flags: REPACK_BUF_* flags for dst +// dst: where repack_line() writes to +// src_flags: REPACK_BUF_* flags for src +// src: where repack_line() reads from +// enable_passthrough: if non-NULL, a bool array of size MP_MAX_PLANES indexed +// by plane; a true entry requests disabling copying the +// plane data to the dst plane. The function will write to +// this array whether the plane can really be passed through +// (i.e. will set array entries from true to false if pass- +// through is not possible). It writes to all MP_MAX_PLANES +// entries. If NULL, all entries are implicitly false. 
+// returns: success (fails on OOM) +bool repack_config_buffers(struct mp_repack *rp, + int dst_flags, struct mp_image *dst, + int src_flags, struct mp_image *src, + bool *enable_passthrough); diff --git a/video/zimg.c b/video/zimg.c index ae3602d297..4e7711f61a 100644 --- a/video/zimg.c +++ b/video/zimg.c @@ -25,6 +25,7 @@ #include "csputils.h" #include "options/m_config.h" #include "options/m_option.h" +#include "repack.h" #include "video/fmt-conversion.h" #include "video/img_format.h" #include "zimg.h" @@ -83,50 +84,14 @@ struct mp_zimg_repack { int num_planes; // number of planes involved unsigned zmask[4]; // zmask[mp_index] = zimg mask (using mp index!) int z_planes[4]; // z_planes[zimg_index] = mp_index (or -1) - bool pass_through_y; // luma plane optimization for e.g. nv12 - // If set, the pack/unpack callback to pass to zimg. - // Called with user==mp_zimg_repack. - zimg_filter_graph_callback repack; - - // Endian-swap (done before/after actual repacker). - int endian_size; // 0=no swapping, 2/4=word byte size to swap - int endian_items[4]; // number of words per pixel/plane - - // For packed_repack. - int components[4]; // p2[n] = mp_image.planes[components[n]] - // pack: p1 is dst, p2 is src - // unpack: p1 is src, p2 is dst - void (*packed_repack_scanline)(void *p1, void *p2[], int x0, int x1); - - // Fringe RGB/YUV. - uint8_t comp_size; - uint8_t *comp_map; - uint8_t comp_shifts[3]; - uint8_t *comp_lut; // 256 * 3 + struct mp_repack *repack; // converting to/from planar // Temporary memory for slice-wise repacking. This may be set even if repack // is not set (then it may be used to avoid alignment issues). This has // about one slice worth of data. struct mp_image *tmp; - // Temporary memory for endian swapping. This has about one slice worth - // of data; set and used only if endian swapping is used (endian_size>0). - // It's also used only for pack==false; packers do this in-place. 
- struct mp_image *tmp_endian; - - // Temporary, per-call source/target frame. - struct mp_image *mpi; - // Y coordinate of first line in mpi; usually 0 if mpi==user_mpi, or the - // start of the current slice (in the current repack cb). - // repackers should use: mpi->data[p] + mpi->stride[p] * (i - mpi_y0) - int mpi_y0; - - struct mp_image *user_mpi; - - // Also temporary, per-call. use_buf[n] == plane n uses tmp (and not mpi). - bool use_buf[4]; - int real_w, real_h; // aligned size }; @@ -243,532 +208,44 @@ void mp_zimg_enable_cmdline_opts(struct mp_zimg_context *ctx, mp_zimg_update_from_cmdline(ctx); // first update } -static int repack_align(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - for (int p = 0; p < r->mpi->fmt.num_planes; p++) { - if (!r->use_buf[p]) - continue; - - int bpp = r->mpi->fmt.bytes[p]; - int xs = r->mpi->fmt.xs[p]; - int ys = r->mpi->fmt.ys[p]; - // Number of lines on this plane. - int h = (1 << r->mpi->fmt.chroma_ys) - (1 << ys) + 1; - - for (int y = i; y < i + h; y++) { - void *a = r->mpi->planes[p] + - r->mpi->stride[p] * (ptrdiff_t)((y - r->mpi_y0) >> ys) + - bpp * (x0 >> xs); - void *b = r->tmp->planes[p] + - r->tmp->stride[p] * (ptrdiff_t)((y >> ys) & r->zmask[p]) + - bpp * (x0 >> xs); - size_t size = ((x1 - x0) >> xs) * bpp; - if (r->pack) { - memcpy(a, b, size); - } else { - memcpy(b, a, size); - } - } - } - - return 0; -} - -// Swap endian for one line. -static void swap_endian(struct mp_zimg_repack *r, struct mp_image *dst, int dst_y, - struct mp_image *src, int src_y, int x0, int x1) -{ - for (int p = 0; p < dst->fmt.num_planes; p++) { - int xs = dst->fmt.xs[p]; - int ys = dst->fmt.ys[p]; - int words_per_pixel = r->endian_items[p]; - int bpp = words_per_pixel * r->endian_size; - // Number of lines on this plane. 
- int h = (1 << dst->fmt.chroma_ys) - (1 << ys) + 1; - int num_words = ((x1 - x0) >> xs) * words_per_pixel; - - for (int y = 0; y < h; y++) { - void *s = src->planes[p] + - src->stride[p] * (ptrdiff_t)((y + src_y) >> ys) + - bpp * (x0 >> xs); - void *d = dst->planes[p] + - dst->stride[p] * (ptrdiff_t)((y + dst_y) >> ys) + - bpp * (x0 >> xs); - switch (r->endian_size) { - case 2: - for (int w = 0; w < num_words; w++) - ((uint16_t *)d)[w] = av_bswap16(((uint16_t *)s)[w]); - break; - case 4: - for (int w = 0; w < num_words; w++) - ((uint32_t *)d)[w] = av_bswap32(((uint32_t *)s)[w]); - break; - default: - assert(0); - } - } - } -} - -// PA = PAck, copy planar input to single packed array -// UN = UNpack, copy packed input to planar output -// Naming convention: -// pa_/un_ prefix to identify conversion direction. -// Left (LSB, lowest byte address) -> Right (MSB, highest byte address). -// (This is unusual; MSB to LSB is more commonly used to describe formats, -// but our convention makes more sense for byte access in little endian.) -// "c" identifies a color component. -// "z" identifies known zero padding. -// "x" identifies uninitialized padding. -// A component is followed by its size in bits. -// Size can be omitted for multiple uniform components (c8c8c8 == ccc8). -// Unpackers will often use "x" for padding, because they ignore it, while -// packers will use "z" because they write zero. 
- -#define PA_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = \ - ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ - ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ - ((packed_t)((plane_t *)src[2])[x] << (sh_c2)) | \ - ((packed_t)((plane_t *)src[3])[x] << (sh_c3)); \ - } \ - } - -#define UN_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3, mask)\ - static void name(void *src, void *dst[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ - ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ - ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ - ((plane_t *)dst[3])[x] = (c >> (sh_c3)) & (mask); \ - } \ - } - - -#define PA_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, pad) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = (pad) | \ - ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ - ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ - ((packed_t)((plane_t *)src[2])[x] << (sh_c2)); \ - } \ - } - -UN_WORD_4(un_cccc8, uint32_t, uint8_t, 0, 8, 16, 24, 0xFFu) -PA_WORD_4(pa_cccc8, uint32_t, uint8_t, 0, 8, 16, 24) -// Not sure if this is a good idea; there may be no alignment guarantee. 
-UN_WORD_4(un_cccc16, uint64_t, uint16_t, 0, 16, 32, 48, 0xFFFFu) -PA_WORD_4(pa_cccc16, uint64_t, uint16_t, 0, 16, 32, 48) - -#define UN_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, mask) \ - static void name(void *src, void *dst[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ - ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ - ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ - } \ - } - -UN_WORD_3(un_ccc8x8, uint32_t, uint8_t, 0, 8, 16, 0xFFu) -PA_WORD_3(pa_ccc8z8, uint32_t, uint8_t, 0, 8, 16, 0) -UN_WORD_3(un_x8ccc8, uint32_t, uint8_t, 8, 16, 24, 0xFFu) -PA_WORD_3(pa_z8ccc8, uint32_t, uint8_t, 8, 16, 24, 0) -UN_WORD_3(un_ccc10x2, uint32_t, uint16_t, 0, 10, 20, 0x3FFu) -PA_WORD_3(pa_ccc10z2, uint32_t, uint16_t, 20, 10, 0, 0) - -#define PA_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, pad) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = (pad) | \ - ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ - ((packed_t)((plane_t *)src[1])[x] << (sh_c1)); \ - } \ - } - -#define UN_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, mask) \ - static void name(void *src, void *dst[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ - ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ - } \ - } - -UN_WORD_2(un_cc8, uint16_t, uint8_t, 0, 8, 0xFFu) -PA_WORD_2(pa_cc8, uint16_t, uint8_t, 0, 8, 0) -UN_WORD_2(un_cc16, uint32_t, uint16_t, 0, 16, 0xFFFFu) -PA_WORD_2(pa_cc16, uint32_t, uint16_t, 0, 16, 0) - -#define PA_SEQ_3(name, comp_t) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - comp_t *r = dst; \ - for (int x = x0; x < x1; x++) { \ - *r++ = ((comp_t *)src[0])[x]; \ - *r++ = ((comp_t *)src[1])[x]; \ - *r++ = ((comp_t *)src[2])[x]; \ - } \ - } - -#define UN_SEQ_3(name, 
comp_t) \ - static void name(void *src, void *dst[], int x0, int x1) { \ - comp_t *r = src; \ - for (int x = x0; x < x1; x++) { \ - ((comp_t *)dst[0])[x] = *r++; \ - ((comp_t *)dst[1])[x] = *r++; \ - ((comp_t *)dst[2])[x] = *r++; \ - } \ - } - -UN_SEQ_3(un_ccc8, uint8_t) -PA_SEQ_3(pa_ccc8, uint8_t) -UN_SEQ_3(un_ccc16, uint16_t) -PA_SEQ_3(pa_ccc16, uint16_t) - -// "regular": single packed plane, all components have same width (except padding) -struct regular_repacker { - int packed_width; // number of bits of the packed pixel - int component_width; // number of bits for a single component - int prepadding; // number of bits of LSB padding - int num_components; // number of components that can be accessed - void (*pa_scanline)(void *p1, void *p2[], int x0, int x1); - void (*un_scanline)(void *p1, void *p2[], int x0, int x1); -}; - -static const struct regular_repacker regular_repackers[] = { - {32, 8, 0, 3, pa_ccc8z8, un_ccc8x8}, - {32, 8, 8, 3, pa_z8ccc8, un_x8ccc8}, - {32, 8, 0, 4, pa_cccc8, un_cccc8}, - {64, 16, 0, 4, pa_cccc16, un_cccc16}, - {24, 8, 0, 3, pa_ccc8, un_ccc8}, - {48, 16, 0, 3, pa_ccc16, un_ccc16}, - {16, 8, 0, 2, pa_cc8, un_cc8}, - {32, 16, 0, 2, pa_cc16, un_cc16}, - {32, 10, 0, 3, pa_ccc10z2, un_ccc10x2}, -}; - -static int packed_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - uint32_t *p1 = (void *)(r->mpi->planes[0] + - r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0)); - - void *p2[4] = {0}; - for (int p = 0; p < r->num_planes; p++) { - int s = r->components[p]; - p2[p] = r->tmp->planes[s] + - r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]); - } - - r->packed_repack_scanline(p1, p2, x0, x1); - - return 0; -} - -struct fringe_rgb_repacker { - // To avoid making a mess of IMGFMT_*, we use av formats directly. - enum AVPixelFormat avfmt; - // If true, use BGR instead of RGB. 
- // False: LSB - R - G - B - pad - MSB - // True: LSB - B - G - R - pad - MSB - bool rev_order; - // Size in bit for each component, strictly from LSB to MSB. - int bits[3]; - bool be; -}; - -static const struct fringe_rgb_repacker fringe_rgb_repackers[] = { - {AV_PIX_FMT_BGR4_BYTE, false, {1, 2, 1}}, - {AV_PIX_FMT_RGB4_BYTE, true, {1, 2, 1}}, - {AV_PIX_FMT_BGR8, false, {3, 3, 2}}, - {AV_PIX_FMT_RGB8, true, {2, 3, 3}}, // pixdesc desc. and doc. bug? - {AV_PIX_FMT_RGB444LE, true, {4, 4, 4}}, - {AV_PIX_FMT_RGB444BE, true, {4, 4, 4}, .be = true}, - {AV_PIX_FMT_BGR444LE, false, {4, 4, 4}}, - {AV_PIX_FMT_BGR444BE, false, {4, 4, 4}, .be = true}, - {AV_PIX_FMT_BGR565LE, false, {5, 6, 5}}, - {AV_PIX_FMT_BGR565BE, false, {5, 6, 5}, .be = true}, - {AV_PIX_FMT_RGB565LE, true, {5, 6, 5}}, - {AV_PIX_FMT_RGB565BE, true, {5, 6, 5}, .be = true}, - {AV_PIX_FMT_BGR555LE, false, {5, 5, 5}}, - {AV_PIX_FMT_BGR555BE, false, {5, 5, 5}, .be = true}, - {AV_PIX_FMT_RGB555LE, true, {5, 5, 5}}, - {AV_PIX_FMT_RGB555BE, true, {5, 5, 5}, .be = true}, -}; - -#define PA_SHIFT_LUT8(name, packed_t) \ - static void name(void *dst, void *src[], int x0, int x1, uint8_t *lut, \ - uint8_t s0, uint8_t s1, uint8_t s2) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = \ - (lut[((uint8_t *)src[0])[x] + 256 * 0] << s0) | \ - (lut[((uint8_t *)src[1])[x] + 256 * 1] << s1) | \ - (lut[((uint8_t *)src[2])[x] + 256 * 2] << s2); \ - } \ - } - - -#define UN_SHIFT_LUT8(name, packed_t) \ - static void name(void *src, void *dst[], int x0, int x1, uint8_t *lut, \ - uint8_t s0, uint8_t s1, uint8_t s2) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((uint8_t *)dst[0])[x] = lut[((c >> s0) & 0xFF) + 256 * 0]; \ - ((uint8_t *)dst[1])[x] = lut[((c >> s1) & 0xFF) + 256 * 1]; \ - ((uint8_t *)dst[2])[x] = lut[((c >> s2) & 0xFF) + 256 * 2]; \ - } \ - } - -PA_SHIFT_LUT8(pa_shift_lut8_8, uint8_t) -PA_SHIFT_LUT8(pa_shift_lut8_16, uint16_t) -UN_SHIFT_LUT8(un_shift_lut8_8, uint8_t) 
-UN_SHIFT_LUT8(un_shift_lut8_16, uint16_t) - -static int fringe_rgb_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - void *p1 = r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0); - - void *p2[4] = {0}; - for (int p = 0; p < r->num_planes; p++) { - int s = r->components[p]; - p2[p] = r->tmp->planes[s] + - r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]); - } - - assert(r->comp_size == 1 || r->comp_size == 2); - - void (*repack)(void *p1, void *p2[], int x0, int x1, uint8_t *lut, - uint8_t s0, uint8_t s1, uint8_t s2) = NULL; - if (r->pack) { - repack = r->comp_size == 1 ? pa_shift_lut8_8 : pa_shift_lut8_16; - } else { - repack = r->comp_size == 1 ? un_shift_lut8_8 : un_shift_lut8_16; - } - repack(p1, p2, x0, x1, r->comp_lut, - r->comp_shifts[0], r->comp_shifts[1], r->comp_shifts[2]); - - return 0; -} - -static int bitmap_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - uint8_t *p1 = - r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0); - uint8_t *p2 = - r->tmp->planes[0] + r->tmp->stride[0] * (ptrdiff_t)(i & r->zmask[0]); - - uint8_t swap = r->comp_size ? 0xFF : 0; - if (r->pack) { - // Supposedly zimg aligns this at least on 64 byte boundaries. Simplifies a - // lot for us. 
- assert(!(x0 & 7)); - - for (int x = x0; x < x1; x += 8) { - uint8_t d = 0; - int max_b = MPMIN(8, x1 - x); - for (int b = 0; b < max_b; b++) - d |= (!!p2[x + b]) << (7 - b); - p1[x / 8] = d ^ swap; - } - } else { - x0 &= ~0x7; - - for (int x = x0; x < x1; x += 8) { - uint8_t d = p1[x / 8] ^ swap; - int max_b = MPMIN(8, x1 - x); - for (int b = 0; b < max_b; b++) - p2[x + b] = !!(d & (1 << (7 - b))); - } - } - - return 0; -} - -static int unpack_pal(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - uint8_t *src = (void *)(r->mpi->planes[0] + - r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0)); - uint32_t *pal = (void *)r->mpi->planes[1]; - - uint8_t *dst[4] = {0}; - for (int p = 0; p < r->num_planes; p++) { - dst[p] = r->tmp->planes[p] + - r->tmp->stride[p] * (ptrdiff_t)(i & r->zmask[p]); - } - - for (int x = x0; x < x1; x++) { - uint32_t c = pal[src[x]]; - dst[0][x] = (c >> 8) & 0xFF; // G - dst[1][x] = (c >> 0) & 0xFF; // B - dst[2][x] = (c >> 16) & 0xFF; // R - dst[3][x] = (c >> 24) & 0xFF; // A - } - - return 0; -} - -struct fringe_yuv422_repacker { - // To avoid making a mess of IMGFMT_*, we use av formats directly. 
- enum AVPixelFormat avfmt; - // In bits (depth/8 rounded up gives byte size) - int8_t depth; - // Word index of each sample: {y0, y1, cb, cr} - uint8_t comp[4]; - bool be; -}; - -static const struct fringe_yuv422_repacker fringe_yuv422_repackers[] = { - {AV_PIX_FMT_YUYV422, 8, {0, 2, 1, 3}}, - {AV_PIX_FMT_UYVY422, 8, {1, 3, 0, 2}}, - {AV_PIX_FMT_YVYU422, 8, {0, 2, 3, 1}}, -#ifdef AV_PIX_FMT_Y210 - {AV_PIX_FMT_Y210LE, 10, {0, 2, 1, 3}}, - {AV_PIX_FMT_Y210BE, 10, {0, 2, 1, 3}, .be = true}, -#endif -}; - -#define PA_P422(name, comp_t) \ - static void name(void *dst, void *src[], int x0, int x1, uint8_t *c) { \ - for (int x = x0; x < x1; x += 2) { \ - ((comp_t *)dst)[x * 2 + c[0]] = ((comp_t *)src[0])[x + 0]; \ - ((comp_t *)dst)[x * 2 + c[1]] = ((comp_t *)src[0])[x + 1]; \ - ((comp_t *)dst)[x * 2 + c[2]] = ((comp_t *)src[1])[x >> 1]; \ - ((comp_t *)dst)[x * 2 + c[3]] = ((comp_t *)src[2])[x >> 1]; \ - } \ - } - - -#define UN_P422(name, comp_t) \ - static void name(void *src, void *dst[], int x0, int x1, uint8_t *c) { \ - for (int x = x0; x < x1; x += 2) { \ - ((comp_t *)dst[0])[x + 0] = ((comp_t *)src)[x * 2 + c[0]]; \ - ((comp_t *)dst[0])[x + 1] = ((comp_t *)src)[x * 2 + c[1]]; \ - ((comp_t *)dst[1])[x >> 1] = ((comp_t *)src)[x * 2 + c[2]]; \ - ((comp_t *)dst[2])[x >> 1] = ((comp_t *)src)[x * 2 + c[3]]; \ - } \ - } - -PA_P422(pa_p422_8, uint8_t) -PA_P422(pa_p422_16, uint16_t) -UN_P422(un_p422_8, uint8_t) -UN_P422(un_p422_16, uint16_t) - -static int fringe_yuv422_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - void *p1 = r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0); - - void *p2[4] = {0}; - for (int p = 0; p < r->num_planes; p++) { - p2[p] = r->tmp->planes[p] + - r->tmp->stride[p] * (ptrdiff_t)(i & r->zmask[p]); - } - - assert(r->comp_size == 1 || r->comp_size == 2); - - void (*repack)(void *p1, void *p2[], int x0, int x1, uint8_t *c) = NULL; - if (r->pack) { - repack = r->comp_size == 1 ? 
pa_p422_8 : pa_p422_16; - } else { - repack = r->comp_size == 1 ? un_p422_8 : un_p422_16; - } - repack(p1, p2, x0, x1, r->comp_map); - - return 0; -} - -static int repack_nv(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - int xs = r->mpi->fmt.chroma_xs; - int ys = r->mpi->fmt.chroma_ys; - - if (r->use_buf[0]) { - // Copy Y. - int l_h = 1 << ys; - for (int y = i; y < i + l_h; y++) { - ptrdiff_t bpp = r->mpi->fmt.bytes[0]; - void *a = r->mpi->planes[0] + - r->mpi->stride[0] * (ptrdiff_t)(y - r->mpi_y0) + bpp * x0; - void *b = r->tmp->planes[0] + - r->tmp->stride[0] * (ptrdiff_t)(y & r->zmask[0]) + bpp * x0; - size_t size = (x1 - x0) * bpp; - if (r->pack) { - memcpy(a, b, size); - } else { - memcpy(b, a, size); - } - } - } - - uint32_t *p1 = (void *)(r->mpi->planes[1] + - r->mpi->stride[1] * (ptrdiff_t)((i - r->mpi_y0) >> ys)); - - void *p2[2]; - for (int p = 0; p < 2; p++) { - int s = r->components[p]; - p2[p] = r->tmp->planes[s] + - r->tmp->stride[s] * (ptrdiff_t)((i >> ys) & r->zmask[s]); - } - - r->packed_repack_scanline(p1, p2, x0 >> xs, x1 >> xs); - - return 0; -} - static int repack_entrypoint(void *user, unsigned i, unsigned x0, unsigned x1) { struct mp_zimg_repack *r = user; - if (r->endian_size && !r->pack) { - r->mpi = r->tmp_endian; - r->mpi_y0 = i; - swap_endian(r, r->mpi, 0, r->user_mpi, i, x0, x1); - } else { - r->mpi = r->user_mpi; - r->mpi_y0 = 0; - } + // If reading is not aligned, just read slightly more data. + if (!r->pack) + x0 &= ~(unsigned)(mp_repack_get_align_x(r->repack) - 1); - if (r->repack) { - r->repack(r, i, x0, x1); - } else { - repack_align(r, i, x0, x1); - } + // mp_repack requirements and zimg guarantees. + assert(!(i & (mp_repack_get_align_y(r->repack) - 1))); + assert(!(x0 & (mp_repack_get_align_x(r->repack) - 1))); - if (r->endian_size && r->pack) - swap_endian(r, r->user_mpi, i, r->mpi, i - r->mpi_y0, x0, x1); + unsigned i_src = i & (r->pack ? 
r->zmask[0] : ZIMG_BUFFER_MAX); + unsigned i_dst = i & (r->pack ? ZIMG_BUFFER_MAX : r->zmask[0]); + + repack_line(r->repack, x0, i_dst, x0, i_src, x1 - x0); - r->mpi = NULL; return 0; } -static void wrap_buffer(struct mp_zimg_repack *r, +static bool wrap_buffer(struct mp_zimg_repack *r, zimg_image_buffer *buf, struct mp_image *mpi) { *buf = (zimg_image_buffer){ZIMG_API_VERSION}; - bool plane_aligned[4] = {0}; - for (int n = 0; n < r->num_planes; n++) { - plane_aligned[n] = !((uintptr_t)mpi->planes[n] % ZIMG_ALIGN) && - !(mpi->stride[n] % ZIMG_ALIGN); + bool direct[MP_MAX_PLANES] = {0}; + + for (int p = 0; p < mpi->num_planes; p++) { + // If alignment is good, try to avoid copy. + direct[p] = !((uintptr_t)mpi->planes[p] % ZIMG_ALIGN) && + !(mpi->stride[p] % ZIMG_ALIGN); } + if (!repack_config_buffers(r->repack, 0, r->pack ? mpi : r->tmp, + 0, r->pack ? r->tmp : mpi, direct)) + return false; + for (int n = 0; n < MP_ARRAY_SIZE(buf->plane); n++) { // Note: this is really the only place we have to care about plane // permutation (zimg_image_buffer may have a different plane order @@ -778,355 +255,67 @@ static void wrap_buffer(struct mp_zimg_repack *r, if (mplane < 0) continue; - r->use_buf[mplane] = !plane_aligned[mplane] || r->endian_size; - if (!(r->pass_through_y && mplane == 0)) - r->use_buf[mplane] |= !!r->repack; - - struct mp_image *tmpi = r->use_buf[mplane] ? r->tmp : mpi; + struct mp_image *tmpi = direct[mplane] ? mpi : r->tmp; buf->plane[n].data = tmpi->planes[mplane]; buf->plane[n].stride = tmpi->stride[mplane]; - buf->plane[n].mask = r->use_buf[mplane] ? r->zmask[mplane] - : ZIMG_BUFFER_MAX; + buf->plane[n].mask = direct[mplane] ? ZIMG_BUFFER_MAX : r->zmask[mplane]; } - r->user_mpi = mpi; + return true; } -// depth = number of LSB in use -static int find_gbrp_format(int depth, int num_planes) +// (ctx can be NULL for probing.) 
+static bool setup_format(zimg_image_format *zfmt, struct mp_zimg_repack *r, + bool pack, struct mp_image_params *user_fmt, + struct mp_zimg_context *ctx) { - if (num_planes != 3 && num_planes != 4) - return 0; - struct mp_regular_imgfmt desc = { - .component_type = MP_COMPONENT_TYPE_UINT, - .forced_csp = MP_CSP_RGB, - .component_size = depth > 8 ? 2 : 1, - .component_pad = depth - (depth > 8 ? 16 : 8), - .num_planes = num_planes, - .planes = { {1, {2}}, {1, {3}}, {1, {1}}, {1, {4}} }, - }; - return mp_find_regular_imgfmt(&desc); -} + r->fmt = *user_fmt; + r->pack = pack; -// depth = number of LSB in use -static int find_gray_format(int depth, int num_planes) -{ - if (num_planes != 1 && num_planes != 2) - return 0; - struct mp_regular_imgfmt desc = { - .component_type = MP_COMPONENT_TYPE_UINT, - .component_size = depth > 8 ? 2 : 1, - .component_pad = depth - (depth > 8 ? 16 : 8), - .num_planes = num_planes, - .planes = { {1, {1}}, {1, {4}} }, - }; - return mp_find_regular_imgfmt(&desc); -} - -static void setup_fringe_rgb_packer(struct mp_zimg_repack *r, - struct mp_zimg_context *ctx) -{ - enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt); - - const struct fringe_rgb_repacker *fmt = NULL; - for (int n = 0; n < MP_ARRAY_SIZE(fringe_rgb_repackers); n++) { - if (fringe_rgb_repackers[n].avfmt == avfmt) { - fmt = &fringe_rgb_repackers[n]; - break; - } - } - - if (!fmt) - return; - - int depth = 8; - if (r->pack) { - // Dither to lowest depth - loses some precision, but result is saner. - depth = fmt->bits[0]; - for (int n = 0; n < 3; n++) - depth = MPMIN(depth, fmt->bits[n]); - } - - r->zimgfmt = find_gbrp_format(depth, 3); - if (!r->zimgfmt) - return; - if (ctx) - r->comp_lut = talloc_array(ctx, uint8_t, 256 * 3); - r->repack = fringe_rgb_repack; - static const int c_order_rgb[] = {3, 1, 2}; - static const int c_order_bgr[] = {2, 1, 3}; - for (int n = 0; n < 3; n++) - r->components[n] = (fmt->rev_order ? 
c_order_bgr : c_order_rgb)[n] - 1; - - int bitpos = 0; - for (int n = 0; n < 3; n++) { - int bits = fmt->bits[n]; - r->comp_shifts[n] = bitpos; - if (r->comp_lut) { - uint8_t *lut = r->comp_lut + 256 * n; - uint8_t zmax = (1 << depth) - 1; - uint8_t cmax = (1 << bits) - 1; - for (int v = 0; v < 256; v++) { - if (r->pack) { - lut[v] = (v * cmax + zmax / 2) / zmax; - } else { - lut[v] = (v & cmax) * zmax / cmax; - } - } - } - bitpos += bits; - } - - r->comp_size = (bitpos + 7) / 8; - assert(r->comp_size == 1 || r->comp_size == 2); - - if (fmt->be) { - assert(r->comp_size == 2); - r->endian_size = 2; - r->endian_items[0] = 1; - } -} - -static void setup_fringe_yuv422_packer(struct mp_zimg_repack *r) -{ - enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt); - - const struct fringe_yuv422_repacker *fmt = NULL; - for (int n = 0; n < MP_ARRAY_SIZE(fringe_yuv422_repackers); n++) { - if (fringe_yuv422_repackers[n].avfmt == avfmt) { - fmt = &fringe_yuv422_repackers[n]; - break; - } - } - - if (!fmt) - return; - - r->comp_size = (fmt->depth + 7) / 8; - assert(r->comp_size == 1 || r->comp_size == 2); - - struct mp_regular_imgfmt yuvfmt = { - .component_type = MP_COMPONENT_TYPE_UINT, - // NB: same problem with P010 and not clearing padding. - .component_size = r->comp_size, - .num_planes = 3, - .planes = { {1, {1}}, {1, {2}}, {1, {3}} }, - .chroma_xs = 1, - .chroma_ys = 0, - }; - r->zimgfmt = mp_find_regular_imgfmt(&yuvfmt); - r->repack = fringe_yuv422_repack; - r->comp_map = (uint8_t *)fmt->comp; - - if (fmt->be) { - assert(r->comp_size == 2); - r->endian_size = 2; - r->endian_items[0] = 4; - } -} - -static void setup_nv_packer(struct mp_zimg_repack *r) -{ - struct mp_regular_imgfmt desc; - if (!mp_get_regular_imgfmt(&desc, r->zimgfmt)) - return; - - // Check for NV. 
- if (desc.num_planes != 2) - return; - if (desc.planes[0].num_components != 1 || desc.planes[0].components[0] != 1) - return; - if (desc.planes[1].num_components != 2) - return; - int cr0 = desc.planes[1].components[0]; - int cr1 = desc.planes[1].components[1]; - if (cr0 > cr1) - MPSWAP(int, cr0, cr1); - if (cr0 != 2 || cr1 != 3) - return; - - // Construct equivalent planar format. - struct mp_regular_imgfmt desc2 = desc; - desc2.num_planes = 3; - desc2.planes[1].num_components = 1; - desc2.planes[1].components[0] = 2; - desc2.planes[2].num_components = 1; - desc2.planes[2].components[0] = 3; - // For P010. Strangely this concept exists only for the NV format. - if (desc2.component_pad > 0) - desc2.component_pad = 0; - - int planar_fmt = mp_find_regular_imgfmt(&desc2); - if (!planar_fmt) - return; - - for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) { - const struct regular_repacker *pa = ®ular_repackers[i]; - - void (*repack_cb)(void *p1, void *p2[], int x0, int x1) = - r->pack ? pa->pa_scanline : pa->un_scanline; - - if (pa->packed_width != desc.component_size * 2 * 8 || - pa->component_width != desc.component_size * 8 || - pa->num_components != 2 || - pa->prepadding != 0 || - !repack_cb) - continue; - - r->repack = repack_nv; - r->pass_through_y = true; - r->packed_repack_scanline = repack_cb; - r->zimgfmt = planar_fmt; - r->components[0] = desc.planes[1].components[0] - 1; - r->components[1] = desc.planes[1].components[1] - 1; - return; - } -} - -static void setup_misc_packer(struct mp_zimg_repack *r) -{ - // Although it's in regular_repackers[], the generic mpv imgfmt metadata - // can't handle it yet. - if (r->zimgfmt == IMGFMT_RGB30) { - int planar_fmt = find_gbrp_format(10, 3); - if (!planar_fmt) - return; - r->zimgfmt = planar_fmt; - r->repack = packed_repack; - r->packed_repack_scanline = r->pack ? 
pa_ccc10z2 : un_ccc10x2; - static int c_order[] = {3, 2, 1}; - for (int n = 0; n < 3; n++) - r->components[n] = c_order[n] - 1; - } else if (r->zimgfmt == IMGFMT_PAL8 && !r->pack) { - int grap_fmt = find_gbrp_format(8, 4); - if (!grap_fmt) - return; - r->zimgfmt = grap_fmt; - r->repack = unpack_pal; - } else { - enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt); - if (avfmt == AV_PIX_FMT_MONOWHITE || avfmt == AV_PIX_FMT_MONOBLACK) { - r->zimgfmt = IMGFMT_Y1; - r->repack = bitmap_repack; - r->comp_size = avfmt == AV_PIX_FMT_MONOWHITE; // abuse to pass a flag - return; - } - } -} - -// Tries to set a packer/unpacker for component-wise byte aligned RGB formats. -static void setup_regular_rgb_packer(struct mp_zimg_repack *r) -{ - struct mp_regular_imgfmt desc; - if (!mp_get_regular_imgfmt(&desc, r->zimgfmt)) - return; - - if (desc.num_planes != 1 || desc.planes[0].num_components < 2) - return; - struct mp_regular_imgfmt_plane *p = &desc.planes[0]; - - int num_real_components = 0; - bool has_alpha = false; - for (int n = 0; n < p->num_components; n++) { - if (p->components[n]) { - has_alpha |= p->components[n] == 4; - num_real_components += 1; - } else { - // padding must be in MSB or LSB - if (n != 0 && n != p->num_components - 1) - return; - } - } - - int depth = desc.component_size * 8 + MPMIN(0, desc.component_pad); - - int planar_fmt = num_real_components > 2 - ? find_gbrp_format(depth, num_real_components) - : find_gray_format(depth, num_real_components); - if (!planar_fmt) - return; - static const int reorder_gbrp[] = {0, 3, 1, 2, 4}; - static const int reorder_gray[] = {0, 1, 0, 0, 4}; - const int *reorder = num_real_components > 2 ? reorder_gbrp : reorder_gray; - - for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) { - const struct regular_repacker *pa = ®ular_repackers[i]; - - // The following may assume little endian (because some repack backends - // use word access, while the metadata here uses byte access). - - int prepad = p->components[0] ? 
0 : 8; - int first_comp = p->components[0] ? 0 : 1; - void (*repack_cb)(void *p1, void *p2[], int x0, int x1) = - r->pack ? pa->pa_scanline : pa->un_scanline; - - if (pa->packed_width != desc.component_size * p->num_components * 8 || - pa->component_width != depth || - pa->num_components != num_real_components || - pa->prepadding != prepad || - !repack_cb) - continue; - - r->repack = packed_repack; - r->packed_repack_scanline = repack_cb; - r->zimgfmt = planar_fmt; - for (int n = 0; n < num_real_components; n++) { - // Determine permutation that maps component order between the two - // formats, with has_alpha special case (see above). - int c = reorder[p->components[first_comp + n]]; - r->components[n] = c == 4 ? num_real_components - 1 : c - 1; - } - return; - } -} - -// (If native_fmt!=r->fmt.imgfmt, this is the swap-endian case; native_fmt is NE.) -// (ctx can be NULL for the sake of probing.) -static bool setup_format_ne(zimg_image_format *zfmt, struct mp_zimg_repack *r, - int native_fmt, struct mp_zimg_context *ctx) -{ zimg_image_format_default(zfmt, ZIMG_API_VERSION); + int rp_flags = 0; + + // For e.g. RGB565, go to lowest depth on pack for less weird dithering. + if (r->pack) { + rp_flags |= REPACK_CREATE_ROUND_DOWN; + } else { + rp_flags |= REPACK_CREATE_EXPAND_8BIT; + } + + r->repack = mp_repack_create_planar(r->fmt.imgfmt, r->pack, rp_flags); + if (!r->repack) + return false; + + int align_x = mp_repack_get_align_x(r->repack); + + r->zimgfmt = r->pack ? 
mp_repack_get_format_src(r->repack) + : mp_repack_get_format_dst(r->repack); + + if (ctx) { + talloc_steal(r, r->repack); + } else { + TA_FREEP(&r->repack); + } + struct mp_image_params fmt = r->fmt; mp_image_params_guess_csp(&fmt); - r->zimgfmt = native_fmt; - - if (!r->repack) - setup_nv_packer(r); - if (!r->repack) - setup_misc_packer(r); - if (!r->repack) - setup_regular_rgb_packer(r); - if (!r->repack) - setup_fringe_rgb_packer(r, ctx); - if (!r->repack) - setup_fringe_yuv422_packer(r); - struct mp_regular_imgfmt desc; if (!mp_get_regular_imgfmt(&desc, r->zimgfmt)) return false; + // Relies on zimg callbacks reading on 64 byte alignment. + if (!MP_IS_POWER_OF_2(align_x) || align_x > 64 / desc.component_size) + return false; + // no weird stuff if (desc.num_planes > 4) return false; - // Endian swapping. - if (native_fmt != fmt.imgfmt) { - struct mp_regular_imgfmt ndesc; - if (!mp_get_regular_imgfmt(&ndesc, native_fmt) || ndesc.num_planes > 4) - return false; - r->endian_size = ndesc.component_size; - if (r->endian_size != 2 && r->endian_size != 4) - return false; - for (int n = 0; n < ndesc.num_planes; n++) - r->endian_items[n] = ndesc.planes[n].num_components; - } - for (int n = 0; n < 4; n++) r->z_planes[n] = -1; - // Accept only true planar formats. for (int n = 0; n < desc.num_planes; n++) { if (desc.planes[n].num_components != 1) return false; @@ -1219,25 +408,6 @@ static bool setup_format_ne(zimg_image_format *zfmt, struct mp_zimg_repack *r, return true; } -static bool setup_format(zimg_image_format *zfmt, struct mp_zimg_repack *r, - bool pack, struct mp_image_params *fmt, - struct mp_zimg_context *ctx) -{ - struct mp_zimg_repack repack_init = { - .pack = pack, - .fmt = *fmt, - }; - *r = repack_init; - if (setup_format_ne(zfmt, r, fmt->imgfmt, ctx)) - return true; - // Try reverse endian. 
- int nimgfmt = mp_find_other_endian(fmt->imgfmt); - if (!nimgfmt) - return false; - *r = repack_init; - return setup_format_ne(zfmt, r, nimgfmt, ctx); -} - static bool allocate_buffer(struct mp_zimg_context *ctx, struct mp_zimg_repack *r) { @@ -1269,20 +439,15 @@ static bool allocate_buffer(struct mp_zimg_context *ctx, if (!r->tmp) return false; + // Note: although zimg doesn't require that the chroma plane's zmask is + // divided by the full size zmask, the repack callback requires it, + // since mp_repack can handle only proper slices. for (int n = 1; n < r->tmp->fmt.num_planes; n++) { r->zmask[n] = r->zmask[0]; if (r->zmask[0] != ZIMG_BUFFER_MAX) r->zmask[n] = r->zmask[n] >> r->tmp->fmt.ys[n]; } - if (r->endian_size && !r->pack) { - r->tmp_endian = mp_image_alloc(r->fmt.imgfmt, r->fmt.w, h); - talloc_steal(r, r->tmp_endian); - - if (!r->tmp_endian) - return false; - } - return true; } @@ -1382,8 +547,12 @@ bool mp_zimg_convert(struct mp_zimg_context *ctx, struct mp_image *dst, assert(ctx->zimg_graph); zimg_image_buffer zsrc, zdst; - wrap_buffer(ctx->zimg_src, &zsrc, src); - wrap_buffer(ctx->zimg_dst, &zdst, dst); + if (!wrap_buffer(ctx->zimg_src, &zsrc, src) || + !wrap_buffer(ctx->zimg_dst, &zdst, dst)) + { + MP_ERR(ctx, "zimg repacker initialization failed.\n"); + return false; + } // An annoyance. 
zimg_image_buffer_const zsrc_c = {ZIMG_API_VERSION}; @@ -1400,9 +569,6 @@ bool mp_zimg_convert(struct mp_zimg_context *ctx, struct mp_image *dst, repack_entrypoint, ctx->zimg_src, repack_entrypoint, ctx->zimg_dst); - ctx->zimg_src->user_mpi = NULL; - ctx->zimg_dst->user_mpi = NULL; - return true; } diff --git a/wscript_build.py b/wscript_build.py index 2a656a604d..93933e7371 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -402,6 +402,7 @@ def build(ctx): ( "test/json.c", "tests" ), ( "test/linked_list.c", "tests" ), ( "test/paths.c", "tests" ), + ( "test/repack.c", "tests && zimg" ), ( "test/scale_sws.c", "tests" ), ( "test/scale_test.c", "tests" ), ( "test/scale_zimg.c", "tests && zimg" ), @@ -529,6 +530,7 @@ def build(ctx): ( "video/out/win32/droptarget.c", "win32-desktop" ), ( "video/out/win_state.c"), ( "video/out/x11_common.c", "x11" ), + ( "video/repack.c" ), ( "video/sws_utils.c" ), ( "video/zimg.c", "zimg" ), ( "video/vaapi.c", "vaapi" ),