/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2000 Tom Barry  All rights reserved.
/////////////////////////////////////////////////////////////////////////////
//
//  This file is subject to the terms of the GNU General Public License as
//  published by the Free Software Foundation.  A copy of this license is
//  included with this software distribution in the file COPYING.  If you
//  do not have a copy, you may obtain a copy by writing to the Free
//  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//
//  This software is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details
/////////////////////////////////////////////////////////////////////////////

// This is a simple lightweight DeInterlace method that uses little CPU time
// but gives very good results for low or intermediate motion.
// It defers frames by one field, but that does not seem to produce noticeable
// lip sync problems.
//
// The method used is to take either the older or newer weave pixel depending
// upon which gives the smaller comb factor, and then clip to avoid large damage
// when wrong.
//
// I'd intended this to be part of a larger more elaborate method added to
// Blended Clip but this gives too good results for the CPU to ignore here.

// Greedy deinterlace kernel.  The same body is compiled under one of three
// names, chosen by IS_SSE / IS_3DNOW (plain MMX otherwise); the only
// differences are how the L1/L3 average is formed and whether the result is
// stored with movntq or movq.
//
// For each interpolated output line, and for each byte of it, the code:
//   1. averages the line above (L1) and the line below (L3),
//   2. picks the newer weave pixel (L2) or the previous one (LP2), whichever
//      is closer to that average (LP2 wins ties),
//   3. clips the pick to [min(L1,L3) - _maxComb, max(L1,L3) + _maxComb]
//      using saturating byte arithmetic.
// The L1/L3 lines themselves are copied to the output with pInfo->pMemcpy.
//
// pInfo fields read here: Overlay / OverlayPitch (destination), InputPitch,
// LineLength, FieldHeight, pMemcpy and PictureHistory[0..2] (pData, Flags).
// Each line is processed as LineLength / 8 qwords.
// NOTE(review): the inner loop decrements before testing, so it presumably
// relies on LineLength being at least 8 - confirm against callers.
#if defined(IS_SSE)
void DScalerFilterGreedy::filterDScaler_SSE(TDeinterlaceInfo* pInfo)
#elif defined(IS_3DNOW)
void DScalerFilterGreedy::filterDScaler_3DNOW(TDeinterlaceInfo* pInfo)
#else
void DScalerFilterGreedy::filterDScaler_MMX(TDeinterlaceInfo* pInfo)
#endif
{
    int Line;
    long LoopCtr;
    unsigned char* L1;                  // ptr to Line1, of 3
    unsigned char* L2;                  // ptr to Line2, the weave line
    unsigned char* L3;                  // ptr to Line3
    unsigned char* LP2;                 // ptr to prev Line2
    unsigned char* Dest = pInfo->Overlay;
    unsigned int Pitch = pInfo->InputPitch;

#ifdef IS_MMX
    int64_t ShiftMask = 0xfefffefffefffeffull;   // to avoid shifting chroma to luma
#endif
    int64_t MaxComb;
    int64_t i;

    // Replicate the 0-255 comb limit into all eight bytes of a qword so it can
    // be applied with a single packed add/subtract.
    i = _maxComb;          // How badly do we let it weave? 0-255
    MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;

    // copy first even line no matter what, and the first odd line if we're
    // processing an EVEN field. (note diff from other deint rtns.)

    if(pInfo->PictureHistory[0]->Flags & PICTURE_INTERLACED_ODD) {
        L1 = pInfo->PictureHistory[1]->pData;
        L2 = pInfo->PictureHistory[0]->pData;
        L3 = L1 + Pitch;
        LP2 = pInfo->PictureHistory[2]->pData;

        // copy first even line
        pInfo->pMemcpy(Dest, L1, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;
    } else {
        L1 = pInfo->PictureHistory[1]->pData;
        L2 = pInfo->PictureHistory[0]->pData + Pitch;
        L3 = L1 + Pitch;
        LP2 = pInfo->PictureHistory[2]->pData + Pitch;

        // copy first even line
        pInfo->pMemcpy(Dest, pInfo->PictureHistory[0]->pData, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;
        // then first odd line
        pInfo->pMemcpy(Dest, L1, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;
    }

    for (Line = 0; Line < (pInfo->FieldHeight - 1); ++Line) {
        LoopCtr = pInfo->LineLength / 8;             // there are LineLength / 8 qwords per line

        // For ease of reading, the comments below assume that we're operating on an odd
        // field (i.e., that PICTURE_INTERLACED_ODD is set in PictureHistory[0]->Flags).
        // Assume the obvious for even fields.

        __asm__ __volatile__
            (
             MOVX"    %[L1],    %%"XAX"\n\t"
             MOVX"    %[L2],    %%"XCX"\n\t"
             MOVX"    %[L3],    %%"XDX"\n\t"
             MOVX"    %[LP2],   %%"XSI"\n\t"
             MOVX"    %[Dest],  %%"XDI"\n\t"  // DL1 if Odd or DL2 if Even

             ".align 8\n\t"
             "1:\n\t"

             "movq    (%%"XAX"), %%mm1\n\t"  // L1
             "movq    (%%"XCX"), %%mm2\n\t"  // L2
             "movq    (%%"XDX"), %%mm3\n\t"  // L3
             "movq    (%%"XSI"), %%mm0\n\t"  // LP2

             // average L1 and L3 leave result in mm4
             "movq    %%mm1,   %%mm4\n\t"    // L1
#if defined(IS_SSE)
             "pavgb   %%mm3,   %%mm4\n\t"
#elif defined(IS_3DNOW)
             "pavgusb %%mm3,   %%mm4\n\t"
#else
             "pand    %[ShiftMask], %%mm4\n\t"    // mask L1 so the word shift cannot leak across bytes
             "psrlw   $1,           %%mm4\n\t"    // L1 / 2
             "movq    %%mm3,        %%mm5\n\t"    // L3
             "pand    %[ShiftMask], %%mm5\n\t"    // same masking for L3
             "psrlw   $1,           %%mm5\n\t"    // L3 / 2
             "paddb   %%mm5,        %%mm4\n\t"    // the average, for computing comb
#endif

             // get abs value of possible L2 comb
             // mm7 must receive a fresh copy of L2 here: it still holds scratch
             // data from the clipping step of the previous qword.
             "movq    %%mm2,   %%mm7\n\t"    // L2
             "psubusb %%mm4,   %%mm7\n\t"    // L2 - avg
             "movq    %%mm4,   %%mm5\n\t"    // avg
             "psubusb %%mm2,   %%mm5\n\t"    // avg - L2
             "por     %%mm7,   %%mm5\n\t"    // abs(avg-L2)
             "movq    %%mm4,   %%mm6\n\t"    // copy of avg for later

             // get abs value of possible LP2 comb
             "movq    %%mm0,   %%mm7\n\t"    // LP2
             "psubusb %%mm4,   %%mm7\n\t"    // LP2 - avg
             "psubusb %%mm0,   %%mm4\n\t"    // avg - LP2
             "por     %%mm7,   %%mm4\n\t"    // abs(avg-LP2)

             // use L2 or LP2 depending upon which makes smaller comb
             "psubusb %%mm5,   %%mm4\n\t"    // see if it goes to zero
             "psubusb %%mm5,   %%mm5\n\t"    // 0
             "pcmpeqb %%mm5,   %%mm4\n\t"    // if (mm4=0) then FF else 0
             "pcmpeqb %%mm4,   %%mm5\n\t"    // opposite of mm4

             // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
             "pand    %%mm2,   %%mm5\n\t"    // use L2 if mm5 == ff, else 0
             "pand    %%mm0,   %%mm4\n\t"    // use LP2 if mm4 = ff, else 0
             "por     %%mm5,   %%mm4\n\t"    // may the best win

             // Now lets clip our chosen value to be not outside of the range
             // of the high/low range L1-L3 by more than MaxComb.
             // This allows some comb but limits the damages and also allows more
             // detail than a boring oversmoothed clip.

             "movq    %%mm1,   %%mm2\n\t"    // copy L1
             "psubusb %%mm3,   %%mm2\n\t"    // - L3, with saturation
             "paddusb %%mm3,   %%mm2\n\t"    // now = Max(L1,L3)

             "pcmpeqb %%mm7,   %%mm7\n\t"    // all ffffffff
             "psubusb %%mm1,   %%mm7\n\t"    // - L1
             "paddusb %%mm7,   %%mm3\n\t"    // add, may sat at fff..
             "psubusb %%mm7,   %%mm3\n\t"    // now = Min(L1,L3)

             // allow the value to be above the high or below the low by amt of MaxComb
             "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
             "psubusb %[MaxComb], %%mm3\n\t" // lower min by diff

             "psubusb %%mm3,   %%mm4\n\t"    // best - Min
             "paddusb %%mm3,   %%mm4\n\t"    // now = Max(best, Min(L1,L3) - MaxComb)

             "pcmpeqb %%mm7,   %%mm7\n\t"    // all ffffffff
             "psubusb %%mm4,   %%mm7\n\t"    // - Max(best, lowered Min)
             "paddusb %%mm7,   %%mm2\n\t"    // add may sat at FFF..
             "psubusb %%mm7,   %%mm2\n\t"    // now = Min(Max(best, lowered Min), raised Max) = best, clipped

#ifdef IS_SSE
             "movntq  %%mm2,   (%%"XDI")\n\t"  // move in our clipped best
#else
             "movq    %%mm2,   (%%"XDI")\n\t"  // move in our clipped best
#endif

             // bump ptrs and loop
             LEAX"    8(%%"XAX"), %%"XAX"\n\t"
             LEAX"    8(%%"XCX"), %%"XCX"\n\t"
             LEAX"    8(%%"XDX"), %%"XDX"\n\t"
             LEAX"    8(%%"XDI"), %%"XDI"\n\t"
             LEAX"    8(%%"XSI"), %%"XSI"\n\t"

             DECX"    %[LoopCtr]\n\t"
             "jnz     1b\n\t"

             // LoopCtr is decremented in place, so it has to be declared as a
             // read-write operand rather than as an input.
             : [LoopCtr] "+m"(LoopCtr)

             : [L1]      "m"(L1),
               [L2]      "m"(L2),
               [L3]      "m"(L3),
               [LP2]     "m"(LP2),
               [Dest]    "m"(Dest),
               [MaxComb] "m"(MaxComb)
#if defined(IS_MMX)
               ,[ShiftMask] "m"(ShiftMask)
#endif

             : XAX, XCX, XDX, XSI, XDI,
#ifdef ARCH_386
               "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
#endif
               "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
               "memory", "cc"
             );

        // The asm advanced only its register copy of Dest; step past the line
        // it just wrote, then emit the untouched L3 line below it.
        Dest += pInfo->OverlayPitch;
        pInfo->pMemcpy(Dest, L3, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;

        L1 += Pitch;
        L2 += Pitch;
        L3 += Pitch;
        LP2 += Pitch;
    }

    // Copy last odd line if we're processing an Odd field.
    if(pInfo->PictureHistory[0]->Flags & PICTURE_INTERLACED_ODD) {
        pInfo->pMemcpy(Dest, L2, pInfo->LineLength);
    }

    // clear out the MMX registers ready for doing floating point
    // again
#ifdef ARCH_386
    __asm__ __volatile__("emms");
#endif
}
