uppsrc: SSE2->SIMD, Core: another decode fix

git-svn-id: svn://ultimatepp.org/upp/trunk@14652 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
cxl 2020-07-03 11:21:07 +00:00
parent df6c02fc5b
commit 988c7c8a7f
6 changed files with 16 additions and 431 deletions

View file

@ -60,18 +60,6 @@ constexpr int findarg(const T& sel, const K& k, const L& ...args)
//$-constexpr auto decode(const T& x, const T1& p0, const V1& v0, ...);
// Terminal overload of decode(): no key/value pairs are left, so the
// default value is the result. `sel` is not inspected.
template <class T, class V>
constexpr const V& decode(const T& sel, const V& def)
{
	return def;
}

// Recursive overload of decode(): compares `sel` with key `k`; on a match the
// paired value `v` is returned, otherwise the remaining arguments are
// processed and their result is converted to V.
template <class T, class K, class V, typename... L>
constexpr V decode(const T& sel, const K& k, const V& v, const L& ...args)
{
	return sel == k ? v
	                : (V)decode(sel, args...);
}
template <class T>
constexpr const char *decode_chr_(const T& sel, const char *def)
{
@ -90,6 +78,18 @@ constexpr const char *decode(const T& sel, const K& k, const char *v, const L& .
return decode_chr_(sel, k, v, args...);
}
// Terminal overload of decode(): no key/value pairs are left, so the
// default value is the result. `sel` is not inspected.
template <class T, class V>
constexpr const V& decode(const T& sel, const V& def)
{
	return def;
}

// Recursive overload of decode(): compares `sel` with key `k`; on a match the
// paired value `v` is returned, otherwise the remaining arguments are
// processed and their result is converted to V.
template <class T, class K, class V, typename... L>
constexpr V decode(const T& sel, const K& k, const V& v, const L& ...args)
{
	return sel == k ? v
	                : (V)decode(sel, args...);
}
//$-constexpr T get_i(int i, const T& p0, const T1& p1, ...);
template <typename A, typename... T>

View file

@ -2,7 +2,7 @@
namespace Upp {
#ifdef CPU_X86
#ifdef CPU_SIMD
void memset8__(void *p, i16x8 data, size_t len)
{

View file

@ -1,5 +1,3 @@
#ifdef CPU_X86
#ifdef CPU_SIMD
force_inline
@ -86,7 +84,7 @@ void StoreRGBAF(RGBA *t, f32x4 s)
force_inline
f32x4 ClampRGBAF(f32x4 p)
{return p;
{
#ifdef PLATFORM_MACOS
f32x4 alpha = Broadcast0(p);
#else
@ -96,100 +94,4 @@ f32x4 ClampRGBAF(f32x4 p)
return min(p, alpha);
}
#else
// Reads one pixel (32 bits at `s`) and widens its four bytes into the four
// low 16-bit lanes of the result; the upper lanes are zero.
force_inline
__m128i LoadRGBA(const RGBA *s)
{
	const __m128i packed = _mm_set_epi32(0, 0, 0, *(dword *)s);
	return _mm_unpacklo_epi8(packed, _mm_setzero_si128());
}
// Duplicates the single pixel `c` into both 64-bit halves of the result,
// each byte widened to a 16-bit lane.
force_inline
__m128i LoadRGBA2(const RGBA& c)
{
	const __m128i twice = _mm_set_epi32(0, 0, *(dword *)&c, *(dword *)&c);
	return _mm_unpacklo_epi8(twice, _mm_setzero_si128());
}
// Loads two independent pixels: `*s0` ends up in the low four 16-bit lanes,
// `*s1` in the high four, each byte zero-extended.
force_inline
__m128i LoadRGBA2(const RGBA *s0, const RGBA *s1)
{
	const __m128i pair = _mm_set_epi32(0, 0, *(dword *)s1, *(dword *)s0);
	return _mm_unpacklo_epi8(pair, _mm_setzero_si128());
}
// Loads the two consecutive pixels s[0] and s[1]: s[0] fills the low four
// 16-bit lanes, s[1] the high four, each byte zero-extended.
force_inline
__m128i LoadRGBA2(const RGBA *s)
{
	const __m128i pair = _mm_set_epi32(0, 0, *(dword *)(s + 1), *(dword *)s);
	return _mm_unpacklo_epi8(pair, _mm_setzero_si128());
}
// Zero-extends the low 8 bytes of `x` into eight 16-bit lanes.
force_inline
__m128i LoadRGBAL(__m128i x)
{
	const __m128i zero = _mm_setzero_si128();
	return _mm_unpacklo_epi8(x, zero);
}
// Zero-extends the high 8 bytes of `x` into eight 16-bit lanes.
force_inline
__m128i LoadRGBAH(__m128i x)
{
	const __m128i zero = _mm_setzero_si128();
	return _mm_unpackhi_epi8(x, zero);
}
// Performs one unaligned 128-bit load at `s` (four pixels) and widens it:
// `l` receives the low 8 bytes, `h` the high 8 bytes, as 16-bit lanes.
force_inline
void LoadRGBA4(const RGBA *s, __m128i& l, __m128i& h)
{
	const __m128i quad = _mm_loadu_si128((__m128i *)s);
	l = LoadRGBAL(quad);
	h = LoadRGBAH(quad);
}
// Narrows sixteen 16-bit lanes (`l` then `h`) back to bytes using unsigned
// saturation; `l` becomes the low 8 bytes of the result, `h` the high 8.
force_inline
__m128i PackRGBA(__m128i l, __m128i h)
{
	const __m128i bytes = _mm_packus_epi16(l, h);
	return bytes;
}
// Packs the low four 16-bit lanes of `x` to bytes and writes those
// 32 bits (one pixel) to `rgba`.
force_inline
void StoreRGBA(RGBA *rgba, __m128i x)
{
	const __m128i packed = PackRGBA(x, _mm_setzero_si128());
	_mm_store_ss((float *)rgba, _mm_castsi128_ps(packed));
}
// Packs the eight 16-bit lanes of `x` to bytes and writes the resulting
// low 64 bits (two pixels) to `rgba`.
force_inline
void StoreRGBA2(RGBA *rgba, __m128i x)
{
	const __m128i packed = PackRGBA(x, _mm_setzero_si128());
	_mm_storel_pd((double *)rgba, _mm_castsi128_pd(packed));
}
// Packs `l` and `h` (eight 16-bit lanes each) to 16 bytes and writes them
// to `rgba` with an unaligned 128-bit store (four pixels).
force_inline
void StoreRGBA4(RGBA *rgba, __m128i l, __m128i h)
{
	const __m128i packed = PackRGBA(l, h);
	_mm_storeu_si128((__m128i *)rgba, packed);
}
// Reads one pixel at `s` and converts its four bytes into four floats:
// bytes -> 16-bit lanes -> 32-bit lanes -> float.
force_inline
__m128 LoadRGBAF(const RGBA *s)
{
	const __m128i bytes  = _mm_set_epi32(0, 0, 0, *(dword *)s);
	const __m128i words  = _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
	const __m128i dwords = _mm_unpacklo_epi16(words, _mm_setzero_si128());
	return _mm_cvtepi32_ps(dwords);
}
// Converts four floats to one pixel and writes it to `t`: truncate to
// int32, pack to int16 with signed saturation, pack to bytes with unsigned
// saturation, then store the low 32 bits.
force_inline
void StoreRGBAF(RGBA *t, __m128 s)
{
	const __m128i as_i32 = _mm_cvttps_epi32(s);
	const __m128i as_i16 = _mm_packs_epi32(as_i32, _mm_setzero_si128());
	const __m128i as_u8  = _mm_packus_epi16(as_i16, _mm_setzero_si128());
	_mm_store_ss((float *)t, _mm_castsi128_ps(as_u8));
}
// Clamps a float pixel so that no channel exceeds the alpha channel and
// alpha itself does not exceed 255.
//
// The alpha lane depends on the platform: other code in this package
// (the SIMD variant of this function and BroadcastAlpha) treats lane 0 as
// alpha under PLATFORM_MACOS and lane 3 elsewhere, so the same selection is
// applied here instead of always broadcasting lane 3.
force_inline
__m128 ClampRGBAF(__m128 p)
{
#ifdef PLATFORM_MACOS
	__m128 alpha = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)); // alpha is lane 0
#else
	__m128 alpha = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); // alpha is lane 3
#endif
	alpha = _mm_min_ps(alpha, _mm_set1_ps(255.0)); // alpha <= 255
	return _mm_min_ps(p, alpha);                   // every channel <= alpha
}
#endif
#endif

View file

@ -7,8 +7,6 @@ void AlphaBlend(RGBA *t, const RGBA& c, int alpha);
void AlphaBlend(RGBA *t, const RGBA *s, int alpha, int len);
void AlphaBlend(RGBA *t, const RGBA& c, int alpha, int len);
#if defined(CPU_SSE2) && !defined(flagNOSIMD)
#ifdef CPU_SIMD
force_inline
@ -148,146 +146,6 @@ void AlphaBlend(RGBA *t, const RGBA *s, int alpha, int len)
#else
// Replicates one 16-bit lane across each 64-bit half of `x`: lane 0 of each
// half under PLATFORM_MACOS, lane 3 of each half otherwise.
force_inline
__m128i BroadcastAlpha(__m128i x)
{
#ifdef PLATFORM_MACOS
	const __m128i upper_done = _mm_shufflehi_epi16(x, 0x00);
	return _mm_shufflelo_epi16(upper_done, 0x00);
#else
	const __m128i upper_done = _mm_shufflehi_epi16(x, 0xff);
	return _mm_shufflelo_epi16(upper_done, 0xff);
#endif
}
// Scales every 16-bit lane of `x` by `alpha`: (lane * alpha) >> 8, using the
// low 16 bits of the product and a logical shift.
force_inline
__m128i Mul8(__m128i x, int alpha)
{
	const __m128i factor  = _mm_set1_epi16(alpha);
	const __m128i product = _mm_mullo_epi16(factor, x);
	return _mm_srli_epi16(product, 8);
}
// Builds the per-lane destination weight from a widened source pixel (or
// pixel pair): the alpha is broadcast over each pixel, scaled, and
// subtracted from 256.
// The scale is 128 (i.e. a * 128 >> 7 == a) in the alpha lane and 129 in the
// colour lanes (noted in the original code as 256*a/255); the alpha lane is
// lane 0 of each half under PLATFORM_MACOS and lane 3 otherwise.
force_inline
__m128i MakeAlpha(__m128i x)
{
#ifdef PLATFORM_MACOS
	const __m128i scale = _mm_set_epi16(129, 129, 129, 128, 129, 129, 129, 128);
#else
	const __m128i scale = _mm_set_epi16(128, 129, 129, 129, 128, 129, 129, 129);
#endif
	const __m128i a = BroadcastAlpha(x);
	const __m128i scaled = _mm_srli_epi16(_mm_mullo_epi16(scale, a), 7);
	return _mm_sub_epi16(_mm_set1_epi16(256), scaled); // 256 - scaled alpha
}
// Core blend step on 16-bit lanes: result = s + ((t * alpha) >> 8), where
// the addition saturates (signed 16-bit).
force_inline
__m128i AlphaBlendSSE2(__m128i t, __m128i s, __m128i alpha)
{
	const __m128i weighted = _mm_srli_epi16(_mm_mullo_epi16(t, alpha), 8);
	return _mm_adds_epi16(s, weighted);
}
// Blends the widened source `s` (with destination weights `alpha`) into the
// single pixel at `t`.
force_inline
void AlphaBlend1(RGBA *t, __m128i s, __m128i alpha)
{
	const __m128i blended = AlphaBlendSSE2(LoadRGBA(t), s, alpha);
	StoreRGBA(t, blended);
}
// Blends the widened source pair `s` (with destination weights `alpha`)
// into the two consecutive pixels at `t`.
force_inline
void AlphaBlend2(RGBA *t, __m128i s, __m128i alpha)
{
	const __m128i blended = AlphaBlendSSE2(LoadRGBA2(t), s, alpha);
	StoreRGBA2(t, blended);
}
// Blends four consecutive pixels at `t` in one 128-bit load/store:
// (sl, al) apply to the low two pixels, (sh, ah) to the high two.
force_inline
void AlphaBlend4(RGBA *t, __m128i sl, __m128i al, __m128i sh, __m128i ah)
{
	__m128i *dst = (__m128i *)t;
	const __m128i current = _mm_loadu_si128(dst);
	const __m128i lo = AlphaBlendSSE2(LoadRGBAL(current), sl, al);
	const __m128i hi = AlphaBlendSSE2(LoadRGBAH(current), sh, ah);
	_mm_storeu_si128(dst, PackRGBA(lo, hi));
}
// Blends the single colour `c` over the pixel at `t`, using the alpha
// carried by `c` itself.
force_inline
void AlphaBlend(RGBA *t, const RGBA& c)
{
	const __m128i src = LoadRGBA(&c);
	const __m128i dst = LoadRGBA(t);
	StoreRGBA(t, AlphaBlendSSE2(dst, src, MakeAlpha(src)));
}
// Blends the colour `c`, first scaled by `alpha` (via Mul8, i.e. * alpha >> 8),
// over the pixel at `t`.
force_inline
void AlphaBlend(RGBA *t, const RGBA& c, int alpha)
{
	const __m128i src = Mul8(LoadRGBA(&c), alpha);
	const __m128i dst = LoadRGBA(t);
	StoreRGBA(t, AlphaBlendSSE2(dst, src, MakeAlpha(src)));
}
// Blends the constant colour `c`, scaled by `alpha`, over `len` consecutive
// pixels starting at `t`. The source and its weights are computed once;
// pixels are processed four at a time, then a pair and a single pixel
// according to bits 1 and 0 of the remaining length.
force_inline
void AlphaBlend(RGBA *t, const RGBA& c, int alpha, int len)
{
	const __m128i src = Mul8(LoadRGBA2(c), alpha);
	const __m128i weight = MakeAlpha(src);
	for(; len >= 4; len -= 4, t += 4)
		AlphaBlend4(t, src, weight, src, weight);
	if(len & 2) {
		AlphaBlend2(t, src, weight);
		t += 2;
	}
	if(len & 1)
		AlphaBlend1(t, src, weight);
}
// Blends `len` source pixels from `s` over the pixels at `t`.
// When `alpha` is exactly 256 the source is used as loaded; for any other
// value every source pixel is first scaled with Mul8(..., alpha).
// Both paths handle four pixels per iteration, then a pair and a single
// pixel according to bits 1 and 0 of the remaining length.
force_inline
void AlphaBlend(RGBA *t, const RGBA *s, int alpha, int len)
{
	if(alpha != 256) {
		// Scaled path: source is multiplied by alpha before blending.
		for(; len >= 4; len -= 4, t += 4, s += 4) {
			const __m128i quad = _mm_loadu_si128((__m128i *)s);
			const __m128i lo = Mul8(LoadRGBAL(quad), alpha);
			const __m128i hi = Mul8(LoadRGBAH(quad), alpha);
			AlphaBlend4(t, lo, MakeAlpha(lo), hi, MakeAlpha(hi));
		}
		if(len & 2) {
			const __m128i pair = Mul8(LoadRGBA2(s), alpha);
			AlphaBlend2(t, pair, MakeAlpha(pair));
			t += 2;
			s += 2;
		}
		if(len & 1) {
			const __m128i single = Mul8(LoadRGBA(s), alpha);
			AlphaBlend1(t, single, MakeAlpha(single));
		}
		return;
	}

	// alpha == 256: source pixels are blended unscaled.
	for(; len >= 4; len -= 4, t += 4, s += 4) {
		const __m128i quad = _mm_loadu_si128((__m128i *)s);
		const __m128i lo = LoadRGBAL(quad);
		const __m128i hi = LoadRGBAH(quad);
		AlphaBlend4(t, lo, MakeAlpha(lo), hi, MakeAlpha(hi));
	}
	if(len & 2) {
		const __m128i pair = LoadRGBA2(s);
		AlphaBlend2(t, pair, MakeAlpha(pair));
		t += 2;
		s += 2;
	}
	if(len & 1) {
		const __m128i single = LoadRGBA(s);
		AlphaBlend1(t, single, MakeAlpha(single));
	}
}
#endif
#else
force_inline
void AlphaBlend__(RGBA& t, const RGBA& c)
{

View file

@ -178,182 +178,6 @@ struct PainterImageSpan : SpanSource, PainterImageSpanData {
}
};
#if 0
// Splits lane 0 of `x` into an integer part (returned) and, for all lanes,
// a fractional part (stored to `fraction`).
// The input is biased by +8000 before the truncating conversion, because
// cvttps truncates toward zero and negatives need fixing; the bias is
// removed from the returned integer.
force_inline
int IntAndFraction(__m128 x, __m128& fraction)
{
	const __m128 biased = _mm_add_ps(x, _mm_set1_ps(8000));
	const __m128i whole = _mm_cvttps_epi32(biased);
	fraction = _mm_sub_ps(biased, _mm_cvtepi32_ps(whole));
	return _mm_cvtsi128_si32(whole) - 8000;
}
// Converts lane 0 of `x` to an int: bias by +8000, truncate, then remove
// the bias from the result.
force_inline
int Int(__m128 x)
{
	const __m128 biased = _mm_add_ps(x, _mm_set1_ps(8000));
	const __m128i whole = _mm_cvttps_epi32(biased);
	return _mm_cvtsi128_si32(whole) - 8000;
}
// Parameters of an image fill: the source image (possibly minified), the
// transform applied to destination coordinates, the fill-style bits and
// cached dimensions derived from the image.
struct PainterImageSpanData {
	int     ax, ay, cx, cy, maxx, maxy;
	byte    style;
	byte    hstyle, vstyle;
	bool    fast;
	bool    fixed; // not set by the constructors
	Image   image;
	Xform2D xform;

	// flags: low 4 bits = style (bits 0-1 horizontal, bits 2-3 vertical),
	//        FILL_FAST selects the fast mode.
	// m: transform whose inverse is stored in `xform`.
	// co, imagecache: forwarded to / select between Minify and MinifyCached.
	PainterImageSpanData(dword flags, const Xform2D& m, const Image& img, bool co, bool imagecache) {
		style = byte(flags & 15);
		hstyle = byte(flags & 3);
		vstyle = byte(flags & 12);
		fast = flags & FILL_FAST;
		image = img;

		// Integer minification factors; they stay 1 in fast mode or when
		// either scale component is below 0.01.
		int nx = 1;
		int ny = 1;
		if(!fast) {
			Pointf sc = m.GetScaleXY();
			if(sc.x >= 0.01 && sc.y >= 0.01) {
				nx = (int)max(1.0, 1.0 / sc.x);
				ny = (int)max(1.0, 1.0 / sc.y);
			}
		}

		if(nx == 1 && ny == 1)
			xform = Inverse(m);
		else {
			if(!fast)
				image = (imagecache ? MinifyCached : Minify)(image, nx, ny, co);
			// Compensate the transform for the reduced image size.
			xform = Inverse(m) * Xform2D::Scale(1.0 / nx, 1.0 / ny);
		}

		cx = image.GetWidth();
		cy = image.GetHeight();
		maxx = cx - 1;
		maxy = cy - 1;
		// Even multiples of cx / cy, used as coordinate offsets by the span code.
		ax = 6000000 / cx * cx * 2;
		ay = 6000000 / cy * cy * 2;
	}

	PainterImageSpanData() {}
};
// Span source that fills a run of destination pixels from an image, using
// the settings copied from PainterImageSpanData.
struct PainterImageSpan : SpanSource, PainterImageSpanData {
PainterImageSpan(const PainterImageSpanData& f)
: PainterImageSpanData(f) {}
// Direct pixel access; performs no bounds handling of its own.
const RGBA *Pixel(int x, int y) { return &image[y][x]; }
// Pixel access with the horizontal/vertical fill style applied to the
// coordinates (pad = clamp to [0, max], reflect, repeat). If the resulting
// coordinates are outside the image and `fixed` is false, a pointer to a
// static zero-initialized RGBA is returned instead.
const RGBA *GetPixel(int x, int y) {
if(hstyle == FILL_HPAD)
x = minmax(x, 0, maxx);
else
if(hstyle == FILL_HREFLECT)
x = (x + ax) / cx & 1 ? (ax - x - 1) % cx : (x + ax) % cx;
else
if(hstyle == FILL_HREPEAT)
x = (x + ax) % cx;
if(vstyle == FILL_VPAD)
y = minmax(y, 0, maxy);
else
if(vstyle == FILL_VREFLECT)
y = (y + ay) / cy & 1 ? (ay - y - 1) % cy : (y + ay) % cy;
else
if(vstyle == FILL_VREPEAT)
y = (y + ay) % cy;
static RGBA zero;
return fixed || (x >= 0 && x < cx && y >= 0 && y < cy) ? &image[y][x] : &zero;
}
// Produces `len` pixels into `span` for the destination run starting at (x, y).
virtual void Get(RGBA *span, int x, int y, unsigned len)
{
PAINTER_TIMING("ImageSpan::Get");
// Source position of the first pixel and the source step per destination pixel.
Pointf p0 = xform.Transform(Pointf(x, y));
Pointf dd = xform.Transform(Pointf(x + 1, y)) - p0;
__m128 x0 = _mm_set_ps1((float)p0.x);
__m128 y0 = _mm_set_ps1((float)p0.y);
__m128 dx = _mm_set_ps1((float)dd.x);
__m128 dy = _mm_set_ps1((float)dd.y);
__m128 ii = _mm_setzero_ps();
__m128 v1 = _mm_set_ps1(1);
__m128 ix, iy;
// Computes the source position for the current index (start + ii * step)
// and advances the index by one.
auto GetIXY = [&] {
ix = _mm_add_ps(x0, _mm_mul_ps(ii, dx));
iy = _mm_add_ps(y0, _mm_mul_ps(ii, dy));
ii = _mm_add_ps(ii, v1);
};
// `fixed` is true only when both a horizontal and a vertical style are set.
fixed = hstyle && vstyle;
// Fast mode with no fill style: one source pixel per output pixel, no interpolation.
if(hstyle + vstyle == 0 && fast) {
while(len--) {
GetIXY();
Point l(Int(ix), Int(iy));
if(l.x > 0 && l.x < maxx && l.y > 0 && l.y < maxy)
*span = *Pixel(l.x, l.y);
else
if(style == 0 && (l.x < -1 || l.x > cx || l.y < -1 || l.y > cy))
*span = RGBAZero();
else
*span = *GetPixel(l.x, l.y);
++span;
}
return;
}
// General path.
while(len--) {
GetIXY();
__m128 fx, fy;
// Integer source coordinates plus fractional parts used as blend weights.
Point l(IntAndFraction(ix, fx), IntAndFraction(iy, fy));
if(hstyle == FILL_HREPEAT)
l.x = (l.x + ax) % cx;
if(vstyle == FILL_VREPEAT)
l.y = (l.y + ay) % cy;
if(style == 0 && (l.x < -1 || l.x > cx || l.y < -1 || l.y > cy))
*span = RGBAZero();
else
if(fast) {
if(l.x > 0 && l.x < maxx && l.y > 0 && l.y < maxy)
*span = *Pixel(l.x, l.y);
else
*span = *GetPixel(l.x, l.y);
}
else {
// Load the 2x2 neighbourhood (pXY = pixel at l.x + X, l.y + Y); the
// unchecked Pixel() is used when the position is strictly inside the image.
__m128 p00, p01, p10, p11;
if(l.x > 0 && l.x < maxx && l.y > 0 && l.y < maxy) {
p00 = LoadRGBAF(Pixel(l.x + 0, l.y + 0));
p01 = LoadRGBAF(Pixel(l.x + 0, l.y + 1));
p10 = LoadRGBAF(Pixel(l.x + 1, l.y + 0));
p11 = LoadRGBAF(Pixel(l.x + 1, l.y + 1));
}
else {
p00 = LoadRGBAF(GetPixel(l.x + 0, l.y + 0));
p01 = LoadRGBAF(GetPixel(l.x + 0, l.y + 1));
p10 = LoadRGBAF(GetPixel(l.x + 1, l.y + 0));
p11 = LoadRGBAF(GetPixel(l.x + 1, l.y + 1));
}
// Weight the four samples: p00*(1-fx)*(1-fy), p01*(1-fx)*fy,
// p10*fx*(1-fy), p11*fx*fy. fx/fy are reused for (1 - f) halfway through.
p01 = _mm_mul_ps(p01, fy);
p11 = _mm_mul_ps(p11, fy);
p10 = _mm_mul_ps(p10, fx);
p11 = _mm_mul_ps(p11, fx);
fx = _mm_sub_ps(v1, fx);
fy = _mm_sub_ps(v1, fy);
p00 = _mm_mul_ps(p00, fy);
p10 = _mm_mul_ps(p10, fy);
p00 = _mm_mul_ps(p00, fx);
p01 = _mm_mul_ps(p01, fx);
// Sum of the weighted samples is the output pixel.
StoreRGBAF(span, _mm_add_ps(p00, _mm_add_ps(p01, _mm_add_ps(p10, p11))));
}
++span;
}
}
};
#endif
void BufferPainter::RenderImage(double width, const Image& image, const Xform2D& transsrc, dword flags)
{
current = Null;

View file

@ -247,7 +247,6 @@ bool RichObject::Set(const String& _type_name, const Value& _data, Size maxsiz
bool RichObject::Read(const String& _type_name, const String& _data, Size sz, void *context)
{
NewSerial();
type_name = _type_name;
RichObjectType *t = Map().Get(type_name, NULL);
if(t) {
@ -258,10 +257,12 @@ bool RichObject::Read(const String& _type_name, const String& _data, Size sz,
pixel_size = type->GetPixelSize(data, context);
AdjustPhysicalSize();
size = sz;
NewSerial();
return true;
}
data = _data;
physical_size = pixel_size = size = sz;
NewSerial();
return false;
}