> fp_motion_vector_info_;
};
} // namespace vp9
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
index 2ca2114ec..a73683dfe 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_ext_ratectrl.h"
#include "vpx_dsp/psnr.h"
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/static_assert.h"
@@ -355,13 +356,14 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
switch (img->fmt) {
case VPX_IMG_FMT_YV12:
case VPX_IMG_FMT_I420:
- case VPX_IMG_FMT_I42016: break;
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_NV12: break;
case VPX_IMG_FMT_I422:
case VPX_IMG_FMT_I444:
case VPX_IMG_FMT_I440:
if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
ERROR(
- "Invalid image format. I422, I444, I440 images are "
+ "Invalid image format. I422, I444, I440, NV12 images are "
"not supported in profile.");
}
break;
@@ -391,6 +393,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
static int get_image_bps(const vpx_image_t *img) {
switch (img->fmt) {
case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_NV12:
case VPX_IMG_FMT_I420: return 12;
case VPX_IMG_FMT_I422: return 16;
case VPX_IMG_FMT_I444: return 24;
@@ -468,10 +471,11 @@ static vpx_rational64_t get_g_timebase_in_ts(vpx_rational_t g_timebase) {
}
static vpx_codec_err_t set_encoder_config(
- VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg,
+ VP9EncoderConfig *oxcf, vpx_codec_enc_cfg_t *cfg,
const struct vp9_extracfg *extra_cfg) {
const int is_vbr = cfg->rc_end_usage == VPX_VBR;
int sl, tl;
+ unsigned int raw_target_rate;
oxcf->profile = cfg->g_profile;
oxcf->max_threads = (int)cfg->g_threads;
oxcf->width = cfg->g_w;
@@ -498,8 +502,14 @@ static vpx_codec_err_t set_encoder_config(
cfg->g_pass == VPX_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames;
oxcf->rc_mode = cfg->rc_end_usage;
+ raw_target_rate =
+ (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 *
+ oxcf->init_framerate / 1000);
+ // Cap target bitrate to raw rate
+ cfg->rc_target_bitrate = VPXMIN(raw_target_rate, cfg->rc_target_bitrate);
+
// Convert target bandwidth from Kbit/s to Bit/s
- oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+ oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
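For a sense of scale, an illustrative calculation (not part of the patch) of the new cap for a 1280x720, 8-bit, 30 fps input:

/* raw_target_rate = 1280 * 720 * 8 * 3 * 30 / 1000 = 663552 (kbit/s) */
/* Any rc_target_bitrate above this value is clamped to it before the
   Kbit/s -> Bit/s conversion below. */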
@@ -624,7 +634,7 @@ static vpx_codec_err_t set_encoder_config(
}
if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf);
- // vp9_dump_encoder_config(oxcf);
+ // vp9_dump_encoder_config(oxcf, stderr);
return VPX_CODEC_OK;
}
@@ -698,6 +708,10 @@ static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx,
extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args);
extra_cfg.cpu_used = VPXMIN(9, extra_cfg.cpu_used);
extra_cfg.cpu_used = VPXMAX(-9, extra_cfg.cpu_used);
+#if CONFIG_REALTIME_ONLY
+ if (extra_cfg.cpu_used > -5 && extra_cfg.cpu_used < 5)
+ extra_cfg.cpu_used = (extra_cfg.cpu_used > 0) ? 5 : -5;
+#endif
return update_extra_cfg(ctx, &extra_cfg);
}
@@ -1559,6 +1573,7 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
lc->scaling_factor_num = params->scaling_factor_num[sl];
lc->scaling_factor_den = params->scaling_factor_den[sl];
lc->speed = params->speed_per_layer[sl];
+ lc->loopfilter_ctrl = params->loopfilter_ctrl[sl];
}
}
@@ -1703,6 +1718,48 @@ static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr(
+ vpx_codec_alg_priv_t *ctx, va_list args) {
+ VP9_COMP *const cpi = ctx->cpi;
+ const unsigned int data = va_arg(args, unsigned int);
+ cpi->rc.disable_overshoot_maxq_cbr = data;
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ VP9_COMP *const cpi = ctx->cpi;
+ const unsigned int data = va_arg(args, unsigned int);
+ cpi->loopfilter_ctrl = data;
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_rc_funcs_t funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args);
+ VP9_COMP *cpi = ctx->cpi;
+ EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+ // TODO(angiebird): Check the possibility of this flag being set at pass == 1
+ if (oxcf->pass == 2) {
+ const FRAME_INFO *frame_info = &cpi->frame_info;
+ vpx_rc_config_t ratectrl_config;
+
+ ratectrl_config.frame_width = frame_info->frame_width;
+ ratectrl_config.frame_height = frame_info->frame_height;
+ ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames;
+
+ // TODO(angiebird): Double check whether this is the proper way to set up
+ // target_bitrate and frame_rate.
+ ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000);
+ ratectrl_config.frame_rate_num = oxcf->g_timebase.den;
+ ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
+
+ vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl);
+ }
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP8_COPY_REFERENCE, ctrl_copy_reference },
@@ -1747,12 +1804,15 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
{ VP9E_SET_ROW_MT, ctrl_set_row_mt },
{ VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop },
+ { VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, ctrl_set_disable_overshoot_maxq_cbr },
{ VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
{ VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred },
{ VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer },
{ VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref },
{ VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync },
{ VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv },
+ { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter },
+ { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control },
// Getters
{ VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
@@ -1886,7 +1946,7 @@ static vp9_extracfg get_extra_cfg() {
VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
vpx_rational_t frame_rate,
- int target_bitrate,
+ int target_bitrate, int encode_speed,
vpx_enc_pass enc_pass) {
/* This function will generate the same VP9EncoderConfig used by the
* vpxenc command given below.
@@ -1897,6 +1957,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
* HEIGHT: frame_height
* FPS: frame_rate
* BITRATE: target_bitrate
+ * CPU_USED: encode_speed
*
* INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig
*
@@ -1908,9 +1969,10 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
* BITRATE=600
* FPS=30/1
* LIMIT=150
+ * CPU_USED=0
* ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS
* --lag-in-frames=25 \
- * --codec=vp9 --good --cpu-used=0 --threads=0 --profile=0 \
+ * --codec=vp9 --good --cpu-used=$CPU_USED --threads=0 --profile=0 \
* --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=150 \
* --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 \
* --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \
@@ -1933,49 +1995,50 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
oxcf.tile_columns = 0;
oxcf.frame_parallel_decoding_mode = 0;
oxcf.two_pass_vbrmax_section = 150;
+ oxcf.speed = abs(encode_speed);
return oxcf;
}
-#define DUMP_STRUCT_VALUE(struct, value) \
- printf(#value " %" PRId64 "\n", (int64_t)(struct)->value)
+#define DUMP_STRUCT_VALUE(fp, structure, value) \
+ fprintf(fp, #value " %" PRId64 "\n", (int64_t)(structure)->value)
-void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
- DUMP_STRUCT_VALUE(oxcf, profile);
- DUMP_STRUCT_VALUE(oxcf, bit_depth);
- DUMP_STRUCT_VALUE(oxcf, width);
- DUMP_STRUCT_VALUE(oxcf, height);
- DUMP_STRUCT_VALUE(oxcf, input_bit_depth);
- DUMP_STRUCT_VALUE(oxcf, init_framerate);
+void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) {
+ DUMP_STRUCT_VALUE(fp, oxcf, profile);
+ DUMP_STRUCT_VALUE(fp, oxcf, bit_depth);
+ DUMP_STRUCT_VALUE(fp, oxcf, width);
+ DUMP_STRUCT_VALUE(fp, oxcf, height);
+ DUMP_STRUCT_VALUE(fp, oxcf, input_bit_depth);
+ DUMP_STRUCT_VALUE(fp, oxcf, init_framerate);
// TODO(angiebird): dump g_timebase
// TODO(angiebird): dump g_timebase_in_ts
- DUMP_STRUCT_VALUE(oxcf, target_bandwidth);
+ DUMP_STRUCT_VALUE(fp, oxcf, target_bandwidth);
- DUMP_STRUCT_VALUE(oxcf, noise_sensitivity);
- DUMP_STRUCT_VALUE(oxcf, sharpness);
- DUMP_STRUCT_VALUE(oxcf, speed);
- DUMP_STRUCT_VALUE(oxcf, rc_max_intra_bitrate_pct);
- DUMP_STRUCT_VALUE(oxcf, rc_max_inter_bitrate_pct);
- DUMP_STRUCT_VALUE(oxcf, gf_cbr_boost_pct);
+ DUMP_STRUCT_VALUE(fp, oxcf, noise_sensitivity);
+ DUMP_STRUCT_VALUE(fp, oxcf, sharpness);
+ DUMP_STRUCT_VALUE(fp, oxcf, speed);
+ DUMP_STRUCT_VALUE(fp, oxcf, rc_max_intra_bitrate_pct);
+ DUMP_STRUCT_VALUE(fp, oxcf, rc_max_inter_bitrate_pct);
+ DUMP_STRUCT_VALUE(fp, oxcf, gf_cbr_boost_pct);
- DUMP_STRUCT_VALUE(oxcf, mode);
- DUMP_STRUCT_VALUE(oxcf, pass);
+ DUMP_STRUCT_VALUE(fp, oxcf, mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, pass);
// Key Framing Operations
- DUMP_STRUCT_VALUE(oxcf, auto_key);
- DUMP_STRUCT_VALUE(oxcf, key_freq);
+ DUMP_STRUCT_VALUE(fp, oxcf, auto_key);
+ DUMP_STRUCT_VALUE(fp, oxcf, key_freq);
- DUMP_STRUCT_VALUE(oxcf, lag_in_frames);
+ DUMP_STRUCT_VALUE(fp, oxcf, lag_in_frames);
// ----------------------------------------------------------------
// DATARATE CONTROL OPTIONS
// vbr, cbr, constrained quality or constant quality
- DUMP_STRUCT_VALUE(oxcf, rc_mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, rc_mode);
// buffer targeting aggressiveness
- DUMP_STRUCT_VALUE(oxcf, under_shoot_pct);
- DUMP_STRUCT_VALUE(oxcf, over_shoot_pct);
+ DUMP_STRUCT_VALUE(fp, oxcf, under_shoot_pct);
+ DUMP_STRUCT_VALUE(fp, oxcf, over_shoot_pct);
// buffering parameters
// TODO(angiebird): dump tarting_buffer_level_ms
@@ -1983,37 +2046,37 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
// TODO(angiebird): dump maximum_buffer_size_ms
// Frame drop threshold.
- DUMP_STRUCT_VALUE(oxcf, drop_frames_water_mark);
+ DUMP_STRUCT_VALUE(fp, oxcf, drop_frames_water_mark);
// controlling quality
- DUMP_STRUCT_VALUE(oxcf, fixed_q);
- DUMP_STRUCT_VALUE(oxcf, worst_allowed_q);
- DUMP_STRUCT_VALUE(oxcf, best_allowed_q);
- DUMP_STRUCT_VALUE(oxcf, cq_level);
- DUMP_STRUCT_VALUE(oxcf, aq_mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, fixed_q);
+ DUMP_STRUCT_VALUE(fp, oxcf, worst_allowed_q);
+ DUMP_STRUCT_VALUE(fp, oxcf, best_allowed_q);
+ DUMP_STRUCT_VALUE(fp, oxcf, cq_level);
+ DUMP_STRUCT_VALUE(fp, oxcf, aq_mode);
// Special handling of Adaptive Quantization for AltRef frames
- DUMP_STRUCT_VALUE(oxcf, alt_ref_aq);
+ DUMP_STRUCT_VALUE(fp, oxcf, alt_ref_aq);
// Internal frame size scaling.
- DUMP_STRUCT_VALUE(oxcf, resize_mode);
- DUMP_STRUCT_VALUE(oxcf, scaled_frame_width);
- DUMP_STRUCT_VALUE(oxcf, scaled_frame_height);
+ DUMP_STRUCT_VALUE(fp, oxcf, resize_mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_width);
+ DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_height);
// Enable feature to reduce the frame quantization every x frames.
- DUMP_STRUCT_VALUE(oxcf, frame_periodic_boost);
+ DUMP_STRUCT_VALUE(fp, oxcf, frame_periodic_boost);
// two pass datarate control
- DUMP_STRUCT_VALUE(oxcf, two_pass_vbrbias);
- DUMP_STRUCT_VALUE(oxcf, two_pass_vbrmin_section);
- DUMP_STRUCT_VALUE(oxcf, two_pass_vbrmax_section);
- DUMP_STRUCT_VALUE(oxcf, vbr_corpus_complexity);
+ DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrbias);
+ DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmin_section);
+ DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmax_section);
+ DUMP_STRUCT_VALUE(fp, oxcf, vbr_corpus_complexity);
// END DATARATE CONTROL OPTIONS
// ----------------------------------------------------------------
// Spatial and temporal scalability.
- DUMP_STRUCT_VALUE(oxcf, ss_number_layers);
- DUMP_STRUCT_VALUE(oxcf, ts_number_layers);
+ DUMP_STRUCT_VALUE(fp, oxcf, ss_number_layers);
+ DUMP_STRUCT_VALUE(fp, oxcf, ts_number_layers);
// Bitrate allocation for spatial layers.
// TODO(angiebird): dump layer_target_bitrate[VPX_MAX_LAYERS]
@@ -2021,25 +2084,25 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
// TODO(angiebird): dump ss_enable_auto_arf[VPX_SS_MAX_LAYERS]
// TODO(angiebird): dump ts_rate_decimator[VPX_TS_MAX_LAYERS]
- DUMP_STRUCT_VALUE(oxcf, enable_auto_arf);
- DUMP_STRUCT_VALUE(oxcf, encode_breakout);
- DUMP_STRUCT_VALUE(oxcf, error_resilient_mode);
- DUMP_STRUCT_VALUE(oxcf, frame_parallel_decoding_mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, enable_auto_arf);
+ DUMP_STRUCT_VALUE(fp, oxcf, encode_breakout);
+ DUMP_STRUCT_VALUE(fp, oxcf, error_resilient_mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, frame_parallel_decoding_mode);
- DUMP_STRUCT_VALUE(oxcf, arnr_max_frames);
- DUMP_STRUCT_VALUE(oxcf, arnr_strength);
+ DUMP_STRUCT_VALUE(fp, oxcf, arnr_max_frames);
+ DUMP_STRUCT_VALUE(fp, oxcf, arnr_strength);
- DUMP_STRUCT_VALUE(oxcf, min_gf_interval);
- DUMP_STRUCT_VALUE(oxcf, max_gf_interval);
+ DUMP_STRUCT_VALUE(fp, oxcf, min_gf_interval);
+ DUMP_STRUCT_VALUE(fp, oxcf, max_gf_interval);
- DUMP_STRUCT_VALUE(oxcf, tile_columns);
- DUMP_STRUCT_VALUE(oxcf, tile_rows);
+ DUMP_STRUCT_VALUE(fp, oxcf, tile_columns);
+ DUMP_STRUCT_VALUE(fp, oxcf, tile_rows);
- DUMP_STRUCT_VALUE(oxcf, enable_tpl_model);
+ DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model);
- DUMP_STRUCT_VALUE(oxcf, max_threads);
+ DUMP_STRUCT_VALUE(fp, oxcf, max_threads);
- DUMP_STRUCT_VALUE(oxcf, target_level);
+ DUMP_STRUCT_VALUE(fp, oxcf, target_level);
// TODO(angiebird): dump two_pass_stats_in
@@ -2047,19 +2110,19 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
// TODO(angiebird): dump firstpass_mb_stats_in
#endif
- DUMP_STRUCT_VALUE(oxcf, tuning);
- DUMP_STRUCT_VALUE(oxcf, content);
+ DUMP_STRUCT_VALUE(fp, oxcf, tuning);
+ DUMP_STRUCT_VALUE(fp, oxcf, content);
#if CONFIG_VP9_HIGHBITDEPTH
- DUMP_STRUCT_VALUE(oxcf, use_highbitdepth);
+ DUMP_STRUCT_VALUE(fp, oxcf, use_highbitdepth);
#endif
- DUMP_STRUCT_VALUE(oxcf, color_space);
- DUMP_STRUCT_VALUE(oxcf, color_range);
- DUMP_STRUCT_VALUE(oxcf, render_width);
- DUMP_STRUCT_VALUE(oxcf, render_height);
- DUMP_STRUCT_VALUE(oxcf, temporal_layering_mode);
+ DUMP_STRUCT_VALUE(fp, oxcf, color_space);
+ DUMP_STRUCT_VALUE(fp, oxcf, color_range);
+ DUMP_STRUCT_VALUE(fp, oxcf, render_width);
+ DUMP_STRUCT_VALUE(fp, oxcf, render_height);
+ DUMP_STRUCT_VALUE(fp, oxcf, temporal_layering_mode);
- DUMP_STRUCT_VALUE(oxcf, row_mt);
- DUMP_STRUCT_VALUE(oxcf, motion_vector_unit_test);
+ DUMP_STRUCT_VALUE(fp, oxcf, row_mt);
+ DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test);
}
FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) {
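A minimal sketch of how the updated entry points are called after this change; the bitrate, frame rate and speed mirror the documented vpxenc example, the frame size is illustrative, and stderr matches the commented-out dump call above:

vpx_rational_t frame_rate = { 30, 1 };
VP9EncoderConfig oxcf = vp9_get_encoder_config(
    352, 288, frame_rate, 600, /*encode_speed=*/0, VPX_RC_LAST_PASS);
vp9_dump_encoder_config(&oxcf, stderr); /* dump target is now an explicit FILE* */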
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.h
index 08569fcc9..01338adb4 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.h
@@ -19,10 +19,10 @@ extern "C" {
VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
vpx_rational_t frame_rate,
- int target_bitrate,
+ int target_bitrate, int encode_speed,
vpx_enc_pass enc_pass);
-void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf);
+void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp);
FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf);
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.c
index 74d08a587..8d031694d 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.c
@@ -88,8 +88,9 @@ vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->y_width = img->d_w;
yv12->y_height = img->d_h;
- yv12->uv_width =
- img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
+ yv12->uv_width = img->x_chroma_shift == 1 || img->fmt == VPX_IMG_FMT_NV12
+ ? (1 + yv12->y_width) / 2
+ : yv12->y_width;
yv12->uv_height =
img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
yv12->uv_crop_width = yv12->uv_width;
@@ -127,5 +128,9 @@ vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
#endif // CONFIG_VP9_HIGHBITDEPTH
yv12->subsampling_x = img->x_chroma_shift;
yv12->subsampling_y = img->y_chroma_shift;
+ // When reading the data, the UV samples of an NV12 image sit interleaved in
+ // one plane, so x_chroma_shift is 0. After conversion, U and V are treated
+ // as separate planes, so subsampling_x must be set to 1.
+ if (img->fmt == VPX_IMG_FMT_NV12) yv12->subsampling_x = 1;
return VPX_CODEC_OK;
}
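In other words, an NV12 input reports x_chroma_shift == 0 on the vpx_image_t side, but the converted buffer still describes 4:2:0 chroma. A rough sketch of the resulting fields for a 1280x720 NV12 image (illustrative, not part of the patch):

/* yv12->y_width  == 1280,  yv12->y_height  == 720 */
/* yv12->uv_width == 640,   yv12->uv_height == 360 */
/* yv12->subsampling_x == 1 (forced above), yv12->subsampling_y == 1 */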
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9cx.mk b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
index ad774505c..38e99165a 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
@@ -18,9 +18,6 @@ VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no)
VP9_CX_SRCS-yes += vp9_cx_iface.c
VP9_CX_SRCS-yes += vp9_cx_iface.h
-VP9_CX_SRCS-$(CONFIG_RATE_CTRL) += simple_encode.cc
-VP9_CX_SRCS-$(CONFIG_RATE_CTRL) += simple_encode.h
-
VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -99,6 +96,8 @@ VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c
VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h
+VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.c
+VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
index 9eed85e5d..4ef93057f 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
@@ -27,13 +27,15 @@
*
*
* An application instantiates a specific decoder instance by using
- * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ * vpx_codec_dec_init() and a pointer to the algorithm's interface structure:
*
* my_app.c:
* extern vpx_codec_iface_t my_codec;
* {
* vpx_codec_ctx_t algo;
- * res = vpx_codec_init(&algo, &my_codec);
+ * int threads = 4;
+ * vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ * res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
* }
*
*
@@ -66,7 +68,7 @@ typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
/*!\brief init function pointer prototype
*
* Performs algorithm-specific initialization of the decoder context. This
- * function is called by the generic vpx_codec_init() wrapper function, so
+ * function is called by vpx_codec_dec_init() and vpx_codec_enc_init(), so
* plugins implementing this interface may trust the input parameters to be
* properly initialized.
*
@@ -175,16 +177,15 @@ typedef const struct vpx_codec_ctrl_fn_map {
/*!\brief decode data function pointer prototype
*
* Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, #VPX_CODEC_CB_PUT_SLICE and
- * #VPX_CODEC_CB_PUT_FRAME events are generated as appropriate. This
- * function is called by the generic vpx_codec_decode() wrapper function,
- * so plugins implementing this interface may trust the input parameters
- * to be properly initialized.
+ * decoded frame becoming available, put_slice and put_frame callbacks
+ * are invoked as appropriate. This function is called by the generic
+ * vpx_codec_decode() wrapper function, so plugins implementing this
+ * interface may trust the input parameters to be properly initialized.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] data Pointer to this block of new coded data. If
- * NULL, a #VPX_CODEC_CB_PUT_FRAME event is posted
- * for the previously decoded frame.
+ * NULL, the put_frame callback is invoked for
+ * the previously decoded frame.
* \param[in] data_sz Size of the coded data, in bytes.
*
* \return Returns #VPX_CODEC_OK if the coded data was processed completely
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_codec.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_codec.c
index 10331aa21..114b94e19 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_codec.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_codec.c
@@ -97,7 +97,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) {
res = VPX_CODEC_INCAPABLE;
- for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
+ for (entry = ctx->iface->ctrl_maps; entry->fn; entry++) {
if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
va_list ap;
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_decoder.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_decoder.c
index fc1c2bcca..427cd1bf4 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_decoder.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_decoder.c
@@ -138,9 +138,10 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
if (!ctx || !cb)
res = VPX_CODEC_INVALID_PARAM;
- else if (!ctx->iface || !ctx->priv ||
- !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
+ else if (!ctx->iface || !ctx->priv)
res = VPX_CODEC_ERROR;
+ else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
+ res = VPX_CODEC_INCAPABLE;
else {
ctx->priv->dec.put_frame_cb.u.put_frame = cb;
ctx->priv->dec.put_frame_cb.user_priv = user_priv;
@@ -157,9 +158,10 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
if (!ctx || !cb)
res = VPX_CODEC_INVALID_PARAM;
- else if (!ctx->iface || !ctx->priv ||
- !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE))
+ else if (!ctx->iface || !ctx->priv)
res = VPX_CODEC_ERROR;
+ else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE))
+ res = VPX_CODEC_INCAPABLE;
else {
ctx->priv->dec.put_slice_cb.u.put_slice = cb;
ctx->priv->dec.put_slice_cb.user_priv = user_priv;
@@ -176,9 +178,10 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
if (!ctx || !cb_get || !cb_release) {
res = VPX_CODEC_INVALID_PARAM;
- } else if (!ctx->iface || !ctx->priv ||
- !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+ } else if (!ctx->iface || !ctx->priv) {
res = VPX_CODEC_ERROR;
+ } else if (!(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+ res = VPX_CODEC_INCAPABLE;
} else {
res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
cb_priv);
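With this split, a caller can tell a missing capability apart from a broken context. A short sketch (the decoder context and callback are assumed to exist; they are not part of the patch):

vpx_codec_err_t res =
    vpx_codec_register_put_frame_cb(&decoder, my_put_frame_cb, NULL);
if (res == VPX_CODEC_INCAPABLE) {
  /* Codec lacks put_frame support; poll vpx_codec_get_frame() instead. */
} else if (res == VPX_CODEC_ERROR) {
  /* Decoder context was not initialized. */
}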
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_image.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_image.c
index a7c6ec0ce..ff496b5d3 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_image.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/src/vpx_image.c
@@ -39,7 +39,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
/* Get sample size for this format */
switch (fmt) {
case VPX_IMG_FMT_I420:
- case VPX_IMG_FMT_YV12: bps = 12; break;
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_NV12: bps = 12; break;
case VPX_IMG_FMT_I422:
case VPX_IMG_FMT_I440: bps = 16; break;
case VPX_IMG_FMT_I444: bps = 24; break;
@@ -51,6 +52,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
}
/* Get chroma shift values for this format */
+ // For VPX_IMG_FMT_NV12, xcs needs to be 0 so that the interleaved UV data is
+ // read as a single plane.
switch (fmt) {
case VPX_IMG_FMT_I420:
case VPX_IMG_FMT_YV12:
@@ -62,6 +65,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
switch (fmt) {
case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_NV12:
case VPX_IMG_FMT_I440:
case VPX_IMG_FMT_YV12:
case VPX_IMG_FMT_I42016:
@@ -173,7 +177,12 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
data + x * bytes_per_sample + y * img->stride[VPX_PLANE_Y];
data += img->h * img->stride[VPX_PLANE_Y];
- if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
+ if (img->fmt == VPX_IMG_FMT_NV12) {
+ img->planes[VPX_PLANE_U] =
+ data + (x >> img->x_chroma_shift) +
+ (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+ img->planes[VPX_PLANE_V] = img->planes[VPX_PLANE_U] + 1;
+ } else if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
img->planes[VPX_PLANE_U] =
data + (x >> img->x_chroma_shift) * bytes_per_sample +
(y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
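A small sketch of what the NV12 branch yields for an allocated image (dimensions are illustrative):

vpx_image_t img;
vpx_img_alloc(&img, VPX_IMG_FMT_NV12, 1280, 720, 32);
/* planes[VPX_PLANE_V] == planes[VPX_PLANE_U] + 1: U and V are interleaved, */
/* and stride[VPX_PLANE_U] spans a full row of UV pairs because xcs is 0.   */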
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vp8cx.h
index dcdd710c0..37ad07d33 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vp8cx.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vp8cx.h
@@ -17,6 +17,7 @@
*/
#include "./vp8.h"
#include "./vpx_encoder.h"
+#include "./vpx_ext_ratectrl.h"
/*!\file
* \brief Provides definitions for using VP8 or VP9 encoder algorithm within the
@@ -684,6 +685,33 @@ enum vp8e_enc_control_id {
* Supported in codecs: VP9
*/
VP9E_SET_DELTA_Q_UV,
+
+ /*!\brief Codec control function to disable increasing Q to max on overshoot in CBR.
+ *
+ * 0: On (default), 1: Disable.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR,
+
+ /*!\brief Codec control function to disable loopfilter.
+ *
+ * 0: Loopfilter on all frames, 1: Disable on non-reference frames.
+ * 2: Disable on all frames.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_DISABLE_LOOPFILTER,
+
+ /*!\brief Codec control function to enable external rate control library.
+ *
+ * args[0]: pointer to a vpx_rc_funcs_t holding the callback set and the
+ * private data used by the external rate control model
+ * (see vpx/vpx_ext_ratectrl.h)
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_EXTERNAL_RATE_CONTROL,
};
/*!\brief vpx 1-D scaling mode
@@ -1034,6 +1062,15 @@ VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int)
#define VPX_CTRL_VP9E_SET_DELTA_Q_UV
+VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int)
+#define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR
+
+VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int)
+#define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER
+
+VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *)
+#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL
+
/*!\endcond */
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
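A minimal usage sketch for the two new scalar controls (the encoder context is assumed to be initialized elsewhere):

vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 1); /* 1: disable */
vpx_codec_control(&encoder, VP9E_SET_DISABLE_LOOPFILTER, 1); /* 1: off on non-reference frames */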
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
index 6371a6ca2..b0a931e01 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
@@ -22,13 +22,16 @@
* video codec algorithm.
*
* An application instantiates a specific codec instance by using
- * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ * vpx_codec_dec_init() or vpx_codec_enc_init() and a pointer to the
+ * algorithm's interface structure:
*
* my_app.c:
* extern vpx_codec_iface_t my_codec;
* {
* vpx_codec_ctx_t algo;
- * res = vpx_codec_init(&algo, &my_codec);
+ * int threads = 4;
+ * vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ * res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
* }
*
*
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk
index 4ed77ad6d..350dc247b 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk
@@ -24,6 +24,7 @@ API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
API_DOC_SRCS-yes += vpx_codec.h
API_DOC_SRCS-yes += vpx_decoder.h
API_DOC_SRCS-yes += vpx_encoder.h
+API_DOC_SRCS-yes += vpx_ext_ratectrl.h
API_DOC_SRCS-yes += vpx_frame_buffer.h
API_DOC_SRCS-yes += vpx_image.h
@@ -39,3 +40,4 @@ API_SRCS-yes += vpx_codec.mk
API_SRCS-yes += vpx_frame_buffer.h
API_SRCS-yes += vpx_image.h
API_SRCS-yes += vpx_integer.h
+API_SRCS-yes += vpx_ext_ratectrl.h
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_decoder.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_decoder.h
index f113f7196..39e5f585f 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_decoder.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_decoder.h
@@ -58,6 +58,10 @@ extern "C" {
#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000
/*!\brief Can receive encoded frames one fragment at a time */
#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000
+/*!\brief Can support frame-based multi-threading */
+#define VPX_CODEC_CAP_FRAME_THREADING 0x200000
+/*!\brief Can support external frame buffers */
+#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
/*! \brief Initialization-time Feature Enabling
*
@@ -66,11 +70,6 @@ extern "C" {
*
* The available flags are specified by VPX_CODEC_USE_* defines.
*/
-/*!\brief Can support frame-based multi-threading */
-#define VPX_CODEC_CAP_FRAME_THREADING 0x200000
-/*!brief Can support external frame buffers */
-#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
-
#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
/*!\brief Conceal errors in decoded frames */
#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000
@@ -185,8 +184,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
/*!\brief Decode data
*
* Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
- * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
+ * decoded frame becoming available, put_slice and put_frame callbacks may be
+ * invoked, as appropriate. Encoded data \ref MUST be passed in DTS (decode
* time stamp) order. Frames produced will always be in PTS (presentation
* time stamp) order.
* If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled,
@@ -199,8 +198,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
*
* \param[in] ctx Pointer to this instance's context
* \param[in] data Pointer to this block of new coded data. If
- * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted
- * for the previously decoded frame.
+ * NULL, the put_frame callback is invoked for
+ * the previously decoded frame.
* \param[in] data_sz Size of the coded data, in bytes.
* \param[in] user_priv Application specific data to associate with
* this frame.
@@ -236,11 +235,10 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter);
/*!\defgroup cap_put_frame Frame-Based Decoding Functions
*
- * The following functions are required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these
- * functions
- * for codecs that don't advertise this capability will result in an error
- * code being returned, usually VPX_CODEC_ERROR
+ * The following function is required to be implemented for all decoders
+ * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling this
+ * function for codecs that don't advertise this capability will result in
+ * an error code being returned, usually VPX_CODEC_INCAPABLE.
* @{
*/
@@ -264,8 +262,9 @@ typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv,
* \retval #VPX_CODEC_OK
* Callback successfully registered.
* \retval #VPX_CODEC_ERROR
- * Decoder context not initialized, or algorithm not capable of
- * posting slice completion.
+ * Decoder context not initialized.
+ * \retval #VPX_CODEC_INCAPABLE
+ * Algorithm not capable of posting frame completion.
*/
vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
vpx_codec_put_frame_cb_fn_t cb,
@@ -275,18 +274,17 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
/*!\defgroup cap_put_slice Slice-Based Decoding Functions
*
- * The following functions are required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these
- * functions
- * for codecs that don't advertise this capability will result in an error
- * code being returned, usually VPX_CODEC_ERROR
+ * The following function is required to be implemented for all decoders
+ * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling this
+ * function for codecs that don't advertise this capability will result in
+ * an error code being returned, usually VPX_CODEC_INCAPABLE.
* @{
*/
/*!\brief put slice callback prototype
*
* This callback is invoked by the decoder to notify the application of
- * the availability of partially decoded image data. The
+ * the availability of partially decoded image data.
*/
typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv,
const vpx_image_t *img,
@@ -305,8 +303,9 @@ typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv,
* \retval #VPX_CODEC_OK
* Callback successfully registered.
* \retval #VPX_CODEC_ERROR
- * Decoder context not initialized, or algorithm not capable of
- * posting slice completion.
+ * Decoder context not initialized.
+ * \retval #VPX_CODEC_INCAPABLE
+ * Algorithm not capable of posting slice completion.
*/
vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
vpx_codec_put_slice_cb_fn_t cb,
@@ -316,10 +315,10 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
*
- * The following section is required to be implemented for all decoders
+ * The following function is required to be implemented for all decoders
* that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
* Calling this function for codecs that don't advertise this capability
- * will result in an error code being returned, usually VPX_CODEC_ERROR.
+ * will result in an error code being returned, usually VPX_CODEC_INCAPABLE.
*
* \note
* Currently this only works with VP9.
@@ -344,8 +343,9 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
* \retval #VPX_CODEC_INVALID_PARAM
* One or more of the callbacks were NULL.
* \retval #VPX_CODEC_ERROR
- * Decoder context not initialized, or algorithm not capable of
- * using external frame buffers.
+ * Decoder context not initialized.
+ * \retval #VPX_CODEC_INCAPABLE
+ * Algorithm not capable of using external frame buffers.
*
* \note
* When decoding VP9, the application may be required to pass in at least
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
index c84d40f7f..39b2aef62 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
@@ -705,6 +705,7 @@ typedef struct vpx_svc_parameters {
int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
int speed_per_layer[VPX_MAX_LAYERS]; /**< Speed setting for each sl */
int temporal_layering_mode; /**< Temporal layering mode */
+ int loopfilter_ctrl[VPX_MAX_LAYERS]; /**< Loopfilter ctrl for each sl */
} vpx_svc_extra_cfg_t;
/*!\brief Initialize an encoder instance
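A hedged sketch of filling the new per-spatial-layer field; a real caller also sets the other vpx_svc_extra_cfg_t fields (scaling factors, per-layer quantizers and speed) before passing the struct through VP9E_SET_SVC_PARAMETERS:

vpx_svc_extra_cfg_t svc_params = { 0 };
svc_params.loopfilter_ctrl[0] = 0; /* base layer: loopfilter on all frames */
svc_params.loopfilter_ctrl[1] = 1; /* enhancement layer: off on non-reference frames */
vpx_codec_control(&encoder, VP9E_SET_SVC_PARAMETERS, &svc_params);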
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_ext_ratectrl.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_ext_ratectrl.h
new file mode 100644
index 000000000..bb3caa614
--- /dev/null
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_ext_ratectrl.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_VPX_EXT_RATECTRL_H_
+#define VPX_VPX_VPX_EXT_RATECTRL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_integer.h"
+
+/*!\brief Abstract rate control model handler
+ *
+ * The encoder will receive the model handler from create_model() defined in
+ * vpx_rc_funcs_t.
+ */
+typedef void *vpx_rc_model_t;
+
+/*!\brief Encode frame decision made by the external rate control model
+ *
+ * The encoder will receive the decision from the external rate control model
+ * through get_encodeframe_decision() defined in vpx_rc_funcs_t.
+ */
+typedef struct vpx_rc_encodeframe_decision {
+ int q_index; /**< Quantizer step index [0..255]*/
+} vpx_rc_encodeframe_decision_t;
+
+/*!\brief Information for the frame to be encoded.
+ *
+ * The encoder will send the information to external rate control model through
+ * get_encodeframe_decision() defined in vpx_rc_funcs_t.
+ *
+ */
+typedef struct vpx_rc_encodeframe_info {
+ /*!
+ * 0: Key frame
+ * 1: Inter frame
+ * 2: Alternate reference frame
+ * 3: Overlay frame
+ * 4: Golden frame
+ */
+ int frame_type;
+ int show_index; /**< display index, starts from zero*/
+ int coding_index; /**< coding index, starts from zero*/
+ int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/
+ /*!
+ * The validity of the three reference frames.
+ * 0: Invalid
+ * 1: Valid
+ */
+ int ref_frame_valid_list[3];
+} vpx_rc_encodeframe_info_t;
+
+/*!\brief Frame coding result
+ *
+ * The encoder will send the result to the external rate control model through
+ * update_encodeframe_result() defined in vpx_rc_funcs_t.
+ */
+typedef struct vpx_rc_encodeframe_result {
+ int64_t sse; /**< sum of squared error of the reconstructed frame */
+ int64_t bit_count; /**< number of bits spent on coding the frame*/
+ int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/
+} vpx_rc_encodeframe_result_t;
+
+/*!\brief Status returned by rate control callback functions.
+ */
+typedef enum vpx_rc_status {
+ VPX_RC_OK = 0,
+ VPX_RC_ERROR = 1,
+} vpx_rc_status_t;
+
+/*!\brief First pass frame stats
+ * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is
+ * omitted
+ */
+typedef struct vpx_rc_frame_stats {
+ /*!
+ * Frame number in display order, if stats are for a single frame.
+ * No real meaning for a collection of frames.
+ */
+ double frame;
+ /*!
+ * Weight assigned to this frame (or total weight for the collection of
+ * frames) currently based on intra factor and brightness factor. This is used
+ * to distribute bits between easier and harder frames.
+ */
+ double weight;
+ /*!
+ * Intra prediction error.
+ */
+ double intra_error;
+ /*!
+ * Best of intra pred error and inter pred error using last frame as ref.
+ */
+ double coded_error;
+ /*!
+ * Best of intra pred error and inter pred error using golden frame as ref.
+ */
+ double sr_coded_error;
+ /*!
+ * Estimate the noise energy of the current frame.
+ */
+ double frame_noise_energy;
+ /*!
+ * Percentage of blocks with inter pred error < intra pred error.
+ */
+ double pcnt_inter;
+ /*!
+ * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+ */
+ double pcnt_motion;
+ /*!
+ * Percentage of blocks where golden frame was better than last or intra:
+ * inter pred error using golden frame < inter pred error using last frame and
+ * inter pred error using golden frame < intra pred error
+ */
+ double pcnt_second_ref;
+ /*!
+ * Percentage of blocks where intra and inter prediction errors were very
+ * close. Note that this is a 'weighted count'; that is, such blocks may be
+ * weighted by how close the two errors were.
+ */
+ double pcnt_neutral;
+ /*!
+ * Percentage of blocks that have intra error < inter error and
+ * inter error < LOW_I_THRESH, where
+ * LOW_I_THRESH = 24000 using bit_depth 8,
+ * LOW_I_THRESH = 24000 << 4 using bit_depth 10,
+ * LOW_I_THRESH = 24000 << 8 using bit_depth 12.
+ */
+ double pcnt_intra_low;
+ /*!
+ * Percentage of blocks that have intra error < inter error and
+ * intra error < LOW_I_THRESH but inter error >= LOW_I_THRESH, where
+ * LOW_I_THRESH = 24000 using bit_depth 8,
+ * LOW_I_THRESH = 24000 << 4 using bit_depth 10,
+ * LOW_I_THRESH = 24000 << 8 using bit_depth 12.
+ */
+ double pcnt_intra_high;
+ /*!
+ * Percentage of blocks that have almost no intra error residual
+ * (i.e. are in effect completely flat and untextured in the intra
+ * domain). In natural videos this is uncommon, but it is much more
+ * common in animations, graphics and screen content, so may be used
+ * as a signal to detect these types of content.
+ */
+ double intra_skip_pct;
+ /*!
+ * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH
+ * SMOOTH_INTRA_THRESH = 4000 using bit_depth 8
+ * SMOOTH_INTRA_THRESH = 4000 << 4 using bit_depth 10
+ * SMOOTH_INTRA_THRESH = 4000 << 8 using bit_depth 12
+ */
+ double intra_smooth_pct;
+ /*!
+ * Image mask rows top and bottom.
+ */
+ double inactive_zone_rows;
+ /*!
+ * Image mask columns at left and right edges.
+ */
+ double inactive_zone_cols;
+ /*!
+ * Average of row motion vectors.
+ */
+ double MVr;
+ /*!
+ * Mean of absolute value of row motion vectors.
+ */
+ double mvr_abs;
+ /*!
+ * Mean of column motion vectors.
+ */
+ double MVc;
+ /*!
+ * Mean of absolute value of column motion vectors.
+ */
+ double mvc_abs;
+ /*!
+ * Variance of row motion vectors.
+ */
+ double MVrv;
+ /*!
+ * Variance of column motion vectors.
+ */
+ double MVcv;
+ /*!
+ * Value in range [-1,1] indicating fraction of row and column motion vectors
+ * that point inwards (negative MV value) or outwards (positive MV value).
+ * For example, a value of 1 indicates that all row/column MVs are inwards.
+ */
+ double mv_in_out_count;
+ /*!
+ * Duration of the frame / collection of frames.
+ */
+ double duration;
+ /*!
+ * 1.0 if stats are for a single frame, OR
+ * Number of frames in this collection for which the stats are accumulated.
+ */
+ double count;
+} vpx_rc_frame_stats_t;
+
+/*!\brief Collection of first pass frame stats
+ */
+typedef struct vpx_rc_firstpass_stats {
+ /*!
+ * Pointer to first pass frame stats.
+ * The pointed array of vpx_rc_frame_stats_t should have length equal to
+ * number of show frames in the video.
+ */
+ vpx_rc_frame_stats_t *frame_stats;
+ /*!
+ * Number of show frames in the video.
+ */
+ int num_frames;
+} vpx_rc_firstpass_stats_t;
+
+/*!\brief Encode config sent to external rate control model
+ */
+typedef struct vpx_rc_config {
+ int frame_width; /**< frame width */
+ int frame_height; /**< frame height */
+ int show_frame_count; /**< number of visible frames in the video */
+ /*!
+ * Target bitrate in kilobits per second
+ */
+ int target_bitrate_kbps;
+ int frame_rate_num; /**< numerator of frame rate */
+ int frame_rate_den; /**< denominator of frame rate */
+} vpx_rc_config_t;
+
+/*!\brief Create an external rate control model callback prototype
+ *
+ * This callback is invoked by the encoder to create an external rate control
+ * model.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] ratectrl_config Pointer to vpx_rc_config_t
+ * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t
+ */
+typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)(
+ void *priv, const vpx_rc_config_t *ratectrl_config,
+ vpx_rc_model_t *rate_ctrl_model_pt);
+
+/*!\brief Send first pass stats to the external rate control model callback
+ * prototype
+ *
+ * This callback is invoked by the encoder to send first pass stats to the
+ * external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] first_pass_stats first pass stats
+ */
+typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_firstpass_stats_t *first_pass_stats);
+
+/*!\brief Receive encode frame decision callback prototype
+ *
+ * This callback is invoked by the encoder to receive encode frame decision from
+ * the external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] encode_frame_info information of the coding frame
+ * \param[out] frame_decision encode decision of the coding frame
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision);
+
+/*!\brief Update encode frame result callback prototype
+ *
+ * This callback is invoked by the encoder to update encode frame result to the
+ * external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] encode_frame_result encode result of the coding frame
+ */
+typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_result_t *encode_frame_result);
+
+/*!\brief Delete the external rate control model callback prototype
+ *
+ * This callback is invoked by the encoder to delete the external rate control
+ * model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ */
+typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model);
+
+/*!\brief Callback function set for external rate control.
+ *
+ * The user can enable external rate control by registering
+ * a set of callback functions with the codec control flag
+ * VP9E_SET_EXTERNAL_RATE_CONTROL.
+ */
+typedef struct vpx_rc_funcs {
+ /*!
+ * Create an external rate control model.
+ */
+ vpx_rc_create_model_cb_fn_t create_model;
+ /*!
+ * Send first pass stats to the external rate control model.
+ */
+ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats;
+ /*!
+ * Get encodeframe decision from the external rate control model.
+ */
+ vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision;
+ /*!
+ * Update encodeframe result to the external rate control model.
+ */
+ vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result;
+ /*!
+ * Delete the external rate control model.
+ */
+ vpx_rc_delete_model_cb_fn_t delete_model;
+ /*!
+ * Private data for the external rate control model.
+ */
+ void *priv;
+} vpx_rc_funcs_t;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_VPX_EXT_RATECTRL_H_
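Tying the new header back to the control added in vp8cx.h, a sketch of how an application might register an external model; the my_* callbacks and my_priv are placeholders, not part of libvpx:

vpx_rc_funcs_t rc_funcs;
rc_funcs.create_model = my_create_model;
rc_funcs.send_firstpass_stats = my_send_firstpass_stats;
rc_funcs.get_encodeframe_decision = my_get_encodeframe_decision;
rc_funcs.update_encodeframe_result = my_update_encodeframe_result;
rc_funcs.delete_model = my_delete_model;
rc_funcs.priv = my_priv;
/* Takes effect in the two-pass (pass == 2) path, per ctrl_set_external_rate_control. */
vpx_codec_control(&encoder, VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);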
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_image.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_image.h
index 98be5966a..bc23be50c 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_image.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx/vpx_image.h
@@ -43,6 +43,7 @@ typedef enum vpx_img_fmt {
VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5,
VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6,
VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7,
+ VPX_IMG_FMT_NV12 = VPX_IMG_FMT_PLANAR | 9,
VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH,
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
index a3a5a4dfe..3c2f50c79 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
@@ -88,10 +88,10 @@
const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
uint32_t val_lw_m; \
\
- __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \
- \
- : [val_lw_m] "=r"(val_lw_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
\
val_lw_m; \
})
@@ -102,10 +102,10 @@
const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
uint64_t val_ld_m = 0; \
\
- __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \
- \
- : [val_ld_m] "=r"(val_ld_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
\
val_ld_m; \
})
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_mmi.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_mmi.c
index 4368db5fd..eaca4773f 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_mmi.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_mmi.c
@@ -364,8 +364,9 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_REF_ABS_SUB_64
@@ -383,6 +384,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -405,9 +407,11 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_AVGREF_ABS_SUB_64
@@ -424,11 +428,12 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -450,8 +455,9 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_REF_ABS_SUB_32
@@ -469,6 +475,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -493,9 +500,11 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_AVGREF_ABS_SUB_32
@@ -512,11 +521,12 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -539,8 +549,9 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_REF_ABS_SUB_16
@@ -558,6 +569,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -586,9 +598,11 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_AVGREF_ABS_SUB_16
@@ -605,11 +619,12 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -632,8 +647,9 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_REF_ABS_SUB_8
@@ -651,6 +667,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -679,9 +696,11 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
unsigned int sad;
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_AVGREF_ABS_SUB_8
@@ -697,11 +716,12 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
"mfc1 %[sad], %[ftmp3] \n\t"
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -724,8 +744,9 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_REF_ABS_SUB_4
@@ -743,6 +764,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -767,9 +789,11 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
unsigned int sad;
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
// Include two loop body, to reduce loop time.
SAD_SRC_AVGREF_ABS_SUB_4
@@ -785,11 +809,12 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
"mfc1 %[sad], %[ftmp3] \n\t"
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/subtract_mmi.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/subtract_mmi.c
index 9f361704a..8bd7e6977 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/subtract_mmi.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/subtract_mmi.c
@@ -24,7 +24,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
switch (rows) {
case 4:
__asm__ volatile(
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
#if _MIPS_SIM == _ABIO32
"ulw %[tmp0], 0x00(%[src]) \n\t"
"mtc1 %[tmp0], %[ftmp1] \n\t"
@@ -118,7 +118,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
break;
case 8:
__asm__ volatile(
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"li %[tmp0], 0x02 \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
@@ -206,7 +206,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
break;
case 16:
__asm__ volatile(
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"li %[tmp0], 0x08 \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_mmi.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_mmi.c
index c1780c33a..c2adcfa01 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_mmi.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_mmi.c
@@ -150,7 +150,7 @@ static const uint8_t bilinear_filters[8][2] = {
"psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
\
/* store: temp2[0] ~ temp2[3] */ \
- "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \
"packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
"gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
@@ -163,7 +163,7 @@ static const uint8_t bilinear_filters[8][2] = {
"psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
\
/* store: temp2[0] ~ temp2[3] */ \
- "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \
"packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
"gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
@@ -225,8 +225,8 @@ static const uint8_t bilinear_filters[8][2] = {
"psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
\
/* store: temp2[0] ~ temp2[7] */ \
- "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
- "and %[ftmp3], %[ftmp3], %[mask] \n\t" \
+ "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \
"packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
"gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
"gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
@@ -247,8 +247,8 @@ static const uint8_t bilinear_filters[8][2] = {
"psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
\
/* store: temp2[0] ~ temp2[7] */ \
- "and %[ftmp8], %[ftmp8], %[mask] \n\t" \
- "and %[ftmp9], %[ftmp9], %[mask] \n\t" \
+ "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \
+ "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \
"packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
"gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
"gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
@@ -319,8 +319,8 @@ static const uint8_t bilinear_filters[8][2] = {
"psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
\
/* store: temp2[8] ~ temp2[15] */ \
- "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
- "and %[ftmp5], %[ftmp5], %[mask] \n\t" \
+ "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \
"packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
"gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
"gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
@@ -343,8 +343,8 @@ static const uint8_t bilinear_filters[8][2] = {
"psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
\
/* store: temp2[8] ~ temp2[15] */ \
- "and %[ftmp10], %[ftmp10], %[mask] \n\t" \
- "and %[ftmp11], %[ftmp11], %[mask] \n\t" \
+ "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \
+ "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \
"packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
"gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
"gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
@@ -414,13 +414,14 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
@@ -478,7 +479,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
"mfc1 %[tmp1], %[ftmp9] \n\t"
"mfhc1 %[tmp2], %[ftmp9] \n\t"
"addu %[sum], %[tmp1], %[tmp2] \n\t"
- "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
+ "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
"paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"swc1 %[ftmp1], 0x00(%[sse]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
@@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (64 * high));
}
@@ -519,13 +521,14 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
"li %[tmp0], 0x40 \n\t"
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
@@ -559,7 +562,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
"mfc1 %[tmp1], %[ftmp9] \n\t"
"mfhc1 %[tmp2], %[ftmp9] \n\t"
"addu %[sum], %[tmp1], %[tmp2] \n\t"
- "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
+ "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
"paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"swc1 %[ftmp1], 0x00(%[sse]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
@@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
[sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / 2048);
}
@@ -590,14 +594,15 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
@@ -625,7 +630,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
"bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
"paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
"swc1 %[ftmp9], 0x00(%[sse]) \n\t"
@@ -636,7 +641,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
"paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
"paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
"swc1 %[ftmp0], 0x00(%[sum]) \n\t"
@@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (32 * high));
}
@@ -676,14 +682,15 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
@@ -701,7 +708,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
"bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
"paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
"swc1 %[ftmp9], 0x00(%[sse]) \n\t"
@@ -712,7 +719,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
"paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
"paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
"swc1 %[ftmp0], 0x00(%[sum]) \n\t"
@@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (16 * high));
}
@@ -753,14 +761,15 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
@@ -773,7 +782,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
"bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
"paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
"swc1 %[ftmp9], 0x00(%[sse]) \n\t"
@@ -784,7 +793,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
"paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
"paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
"swc1 %[ftmp0], 0x00(%[sum]) \n\t"
@@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (8 * high));
}
@@ -825,14 +835,15 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp10] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
- "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
@@ -845,7 +856,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
"bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
+ "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
"paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
"swc1 %[ftmp9], 0x00(%[sse]) \n\t"
@@ -856,7 +867,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
"paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
"psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
"paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
"swc1 %[ftmp0], 0x00(%[sum]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
@@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (4 * high));
}
@@ -894,12 +906,13 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"1: \n\t"
VARIANCE_SSE_16
@@ -909,7 +922,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
"bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
"paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
"swc1 %[ftmp9], 0x00(%[sse]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
@@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse;
}
@@ -947,12 +961,13 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"1: \n\t"
VARIANCE_SSE_8
@@ -962,7 +977,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
"bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
"paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
"swc1 %[ftmp9], 0x00(%[sse]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
@@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse;
}
@@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
uint8_t *temp2_ptr = temp2;
mips_reg l_counter = counter;
double ftmp[15];
+ double ff_ph_40, mask;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ uint64_t x0, x1, y0, y1, all;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
-
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[15]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
@@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
[ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
[ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
[tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR16XN(H) \
@@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
mips_reg l_counter = counter;
double ftmp[15];
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[7]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
[ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
[ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
[tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR8XN(H) \
@@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
mips_reg l_counter = counter;
double ftmp[7];
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp6])
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp6])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[3]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
@@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
: [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
[ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
[ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
- [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+ [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR4XN(H) \
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
index ba9ceb866..cb7bca558 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
@@ -105,7 +105,7 @@ static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
/* clang-format off */
__asm__ volatile(
"move %[tmp1], %[width] \n\t"
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
"gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
"gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
@@ -178,7 +178,7 @@ static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
__asm__ volatile(
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
"gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
@@ -271,7 +271,7 @@ static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
__asm__ volatile(
"move %[tmp1], %[width] \n\t"
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
"gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
"gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
@@ -354,7 +354,7 @@ static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
__asm__ volatile(
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
"gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
@@ -467,7 +467,7 @@ void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
__asm__ volatile(
"move %[tmp1], %[width] \n\t"
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"li %[tmp0], 0x10001 \n\t"
MMI_MTC1(%[tmp0], %[ftmp3])
"punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm
index 80cced4ce..f51718cf9 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm
@@ -16,7 +16,7 @@ SECTION .text
;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise,
; int blackclamp, int whiteclamp,
; int width, int height, int pitch)
-global sym(vpx_plane_add_noise_sse2) PRIVATE
+globalsym(vpx_plane_add_noise_sse2)
sym(vpx_plane_add_noise_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
index 9d8e5e3e0..b3af677d2 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -95,7 +95,7 @@ SECTION .text
; int *flimits,
; int size
;)
-global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
+globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
sym(vpx_post_proc_down_and_across_mb_row_sse2):
push rbp
mov rbp, rsp
@@ -235,7 +235,7 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
; int pitch, int rows, int cols,int flimit)
-global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
+globalsym(vpx_mbpost_proc_across_ip_sse2)
sym(vpx_mbpost_proc_across_ip_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
index a256a59ec..5bee51fa0 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -22,7 +22,7 @@ SECTION .text
; unsigned int * SSE,
; int * Sum
;)
-global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
+globalsym(vpx_highbd_calc16x16var_sse2)
sym(vpx_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
@@ -175,7 +175,7 @@ sym(vpx_highbd_calc16x16var_sse2):
; unsigned int * SSE,
; int * Sum
;)
-global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
+globalsym(vpx_highbd_calc8x8var_sse2)
sym(vpx_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
index 175dcc089..acbd2e4fa 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
@@ -173,7 +173,7 @@ SECTION .text
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad16x16x3_sse3) PRIVATE
+globalsym(vpx_sad16x16x3_sse3)
sym(vpx_sad16x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -215,7 +215,7 @@ sym(vpx_sad16x16x3_sse3):
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad16x8x3_sse3) PRIVATE
+globalsym(vpx_sad16x8x3_sse3)
sym(vpx_sad16x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -253,7 +253,7 @@ sym(vpx_sad16x8x3_sse3):
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad8x16x3_sse3) PRIVATE
+globalsym(vpx_sad8x16x3_sse3)
sym(vpx_sad8x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -282,7 +282,7 @@ sym(vpx_sad8x16x3_sse3):
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad8x8x3_sse3) PRIVATE
+globalsym(vpx_sad8x8x3_sse3)
sym(vpx_sad8x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -307,7 +307,7 @@ sym(vpx_sad8x8x3_sse3):
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad4x4x3_sse3) PRIVATE
+globalsym(vpx_sad4x4x3_sse3)
sym(vpx_sad4x4x3_sse3):
STACK_FRAME_CREATE_X3
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
index 03999dfca..0818ed5f0 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
@@ -173,7 +173,7 @@ SECTION .text
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array);
-global sym(vpx_sad16x16x8_sse4_1) PRIVATE
+globalsym(vpx_sad16x16x8_sse4_1)
sym(vpx_sad16x16x8_sse4_1):
push rbp
mov rbp, rsp
@@ -214,7 +214,7 @@ sym(vpx_sad16x16x8_sse4_1):
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vpx_sad16x8x8_sse4_1) PRIVATE
+globalsym(vpx_sad16x8x8_sse4_1)
sym(vpx_sad16x8x8_sse4_1):
push rbp
mov rbp, rsp
@@ -251,7 +251,7 @@ sym(vpx_sad16x8x8_sse4_1):
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vpx_sad8x8x8_sse4_1) PRIVATE
+globalsym(vpx_sad8x8x8_sse4_1)
sym(vpx_sad8x8x8_sse4_1):
push rbp
mov rbp, rsp
@@ -288,7 +288,7 @@ sym(vpx_sad8x8x8_sse4_1):
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vpx_sad8x16x8_sse4_1) PRIVATE
+globalsym(vpx_sad8x16x8_sse4_1)
sym(vpx_sad8x16x8_sse4_1):
push rbp
mov rbp, rsp
@@ -329,7 +329,7 @@ sym(vpx_sad8x16x8_sse4_1):
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vpx_sad4x4x8_sse4_1) PRIVATE
+globalsym(vpx_sad4x4x8_sse4_1)
sym(vpx_sad4x4x8_sse4_1):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
index 7cf93cf51..a5bc6d730 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
@@ -154,7 +154,7 @@ SECTION .text
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad16x16x3_ssse3) PRIVATE
+globalsym(vpx_sad16x16x3_ssse3)
sym(vpx_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
@@ -267,7 +267,7 @@ sym(vpx_sad16x16x3_ssse3):
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vpx_sad16x8x3_ssse3) PRIVATE
+globalsym(vpx_sad16x8x3_ssse3)
sym(vpx_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
index 300fa8aab..41ffbb07e 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -64,7 +64,7 @@ SECTION .text
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
-global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
+globalsym(vpx_ssim_parms_16x16_sse2)
sym(vpx_ssim_parms_16x16_sse2):
push rbp
mov rbp, rsp
@@ -154,7 +154,7 @@ sym(vpx_ssim_parms_16x16_sse2):
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
-global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
+globalsym(vpx_ssim_parms_8x8_sse2)
sym(vpx_ssim_parms_8x8_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h
index 6e07871b1..b4f1190d7 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h
@@ -16,7 +16,7 @@
#include "./vpx_config.h"
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
- // Unpack 16 bit elements. Goes from:
+ // Unpack 8 bit elements. Goes from:
// in[0]: 00 01 02 03
// in[1]: 10 11 12 13
// in[2]: 20 21 22 23
@@ -27,7 +27,7 @@ static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
- // Unpack 32 bit elements resulting in:
+ // Unpack 16 bit elements resulting in:
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
return _mm_unpacklo_epi16(a0, a1);
}
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
index c57149657..fc301fb39 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -208,7 +208,7 @@ SECTION .text
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v8_sse2)
sym(vpx_highbd_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
@@ -278,7 +278,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_v8_sse2)
sym(vpx_highbd_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
@@ -337,7 +337,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v8_sse2)
sym(vpx_highbd_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
@@ -391,7 +391,7 @@ sym(vpx_highbd_filter_block1d16_v8_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2)
sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
@@ -452,7 +452,7 @@ sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2)
sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
@@ -501,7 +501,7 @@ sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2)
sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
@@ -563,7 +563,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h8_sse2)
sym(vpx_highbd_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
@@ -638,7 +638,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_h8_sse2)
sym(vpx_highbd_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
@@ -704,7 +704,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h8_sse2)
sym(vpx_highbd_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
@@ -772,7 +772,7 @@ sym(vpx_highbd_filter_block1d16_h8_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2)
sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
@@ -838,7 +838,7 @@ sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2)
sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
@@ -895,7 +895,7 @@ sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2)
sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
index ec18d370e..bd51c75bc 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -173,7 +173,7 @@
SECTION .text
-global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v2_sse2)
sym(vpx_highbd_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
@@ -198,7 +198,7 @@ sym(vpx_highbd_filter_block1d4_v2_sse2):
ret
%if VPX_ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_v2_sse2)
sym(vpx_highbd_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
@@ -224,7 +224,7 @@ sym(vpx_highbd_filter_block1d8_v2_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v2_sse2)
sym(vpx_highbd_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
@@ -253,7 +253,7 @@ sym(vpx_highbd_filter_block1d16_v2_sse2):
ret
%endif
-global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2)
sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
@@ -278,7 +278,7 @@ sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
ret
%if VPX_ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2)
sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
@@ -304,7 +304,7 @@ sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2)
sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
@@ -333,7 +333,7 @@ sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
ret
%endif
-global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h2_sse2)
sym(vpx_highbd_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
@@ -359,7 +359,7 @@ sym(vpx_highbd_filter_block1d4_h2_sse2):
ret
%if VPX_ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_h2_sse2)
sym(vpx_highbd_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
@@ -385,7 +385,7 @@ sym(vpx_highbd_filter_block1d8_h2_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h2_sse2)
sym(vpx_highbd_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
@@ -414,7 +414,7 @@ sym(vpx_highbd_filter_block1d16_h2_sse2):
ret
%endif
-global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2)
sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
@@ -440,7 +440,7 @@ sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
ret
%if VPX_ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2)
sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
@@ -466,7 +466,7 @@ sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
pop rbp
ret
-global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2)
sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
index 8497e1721..c8455e13a 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -187,7 +187,7 @@ SECTION .text
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v8_sse2)
sym(vpx_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
@@ -254,7 +254,7 @@ sym(vpx_filter_block1d4_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v8_sse2)
sym(vpx_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
@@ -313,7 +313,7 @@ sym(vpx_filter_block1d8_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v8_sse2)
sym(vpx_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
@@ -367,7 +367,7 @@ sym(vpx_filter_block1d16_v8_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v8_avg_sse2)
sym(vpx_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
@@ -425,7 +425,7 @@ sym(vpx_filter_block1d4_v8_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v8_avg_sse2)
sym(vpx_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
@@ -474,7 +474,7 @@ sym(vpx_filter_block1d8_v8_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v8_avg_sse2)
sym(vpx_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
@@ -536,7 +536,7 @@ sym(vpx_filter_block1d16_v8_avg_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h8_sse2)
sym(vpx_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
@@ -610,7 +610,7 @@ sym(vpx_filter_block1d4_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h8_sse2)
sym(vpx_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
@@ -685,7 +685,7 @@ sym(vpx_filter_block1d8_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h8_sse2)
sym(vpx_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
@@ -771,7 +771,7 @@ sym(vpx_filter_block1d16_h8_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h8_avg_sse2)
sym(vpx_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
@@ -836,7 +836,7 @@ sym(vpx_filter_block1d4_h8_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h8_avg_sse2)
sym(vpx_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
@@ -902,7 +902,7 @@ sym(vpx_filter_block1d8_h8_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h8_avg_sse2)
sym(vpx_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
index 6d79492e4..65790b1c2 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -133,7 +133,7 @@
SECTION .text
-global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v2_sse2)
sym(vpx_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
@@ -157,7 +157,7 @@ sym(vpx_filter_block1d4_v2_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v2_sse2)
sym(vpx_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
@@ -183,7 +183,7 @@ sym(vpx_filter_block1d8_v2_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v2_sse2)
sym(vpx_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
@@ -211,7 +211,7 @@ sym(vpx_filter_block1d16_v2_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v2_avg_sse2)
sym(vpx_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
@@ -235,7 +235,7 @@ sym(vpx_filter_block1d4_v2_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v2_avg_sse2)
sym(vpx_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
@@ -261,7 +261,7 @@ sym(vpx_filter_block1d8_v2_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v2_avg_sse2)
sym(vpx_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
@@ -289,7 +289,7 @@ sym(vpx_filter_block1d16_v2_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h2_sse2)
sym(vpx_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
@@ -314,7 +314,7 @@ sym(vpx_filter_block1d4_h2_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h2_sse2)
sym(vpx_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
@@ -341,7 +341,7 @@ sym(vpx_filter_block1d8_h2_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h2_sse2)
sym(vpx_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
@@ -369,7 +369,7 @@ sym(vpx_filter_block1d16_h2_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h2_avg_sse2)
sym(vpx_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
@@ -394,7 +394,7 @@ sym(vpx_filter_block1d4_h2_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h2_avg_sse2)
sym(vpx_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
@@ -421,7 +421,7 @@ sym(vpx_filter_block1d8_h2_avg_sse2):
pop rbp
ret
-global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h2_avg_sse2)
sym(vpx_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
index 8c9c817be..32e3cd3d9 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -107,7 +107,7 @@
SECTION .text
-global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_v2_ssse3)
sym(vpx_filter_block1d4_v2_ssse3):
push rbp
mov rbp, rsp
@@ -131,7 +131,7 @@ sym(vpx_filter_block1d4_v2_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_v2_ssse3)
sym(vpx_filter_block1d8_v2_ssse3):
push rbp
mov rbp, rsp
@@ -157,7 +157,7 @@ sym(vpx_filter_block1d8_v2_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_v2_ssse3)
sym(vpx_filter_block1d16_v2_ssse3):
push rbp
mov rbp, rsp
@@ -184,7 +184,7 @@ sym(vpx_filter_block1d16_v2_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_v2_avg_ssse3)
sym(vpx_filter_block1d4_v2_avg_ssse3):
push rbp
mov rbp, rsp
@@ -208,7 +208,7 @@ sym(vpx_filter_block1d4_v2_avg_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_v2_avg_ssse3)
sym(vpx_filter_block1d8_v2_avg_ssse3):
push rbp
mov rbp, rsp
@@ -234,7 +234,7 @@ sym(vpx_filter_block1d8_v2_avg_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_v2_avg_ssse3)
sym(vpx_filter_block1d16_v2_avg_ssse3):
push rbp
mov rbp, rsp
@@ -261,7 +261,7 @@ sym(vpx_filter_block1d16_v2_avg_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_h2_ssse3)
sym(vpx_filter_block1d4_h2_ssse3):
push rbp
mov rbp, rsp
@@ -286,7 +286,7 @@ sym(vpx_filter_block1d4_h2_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_h2_ssse3)
sym(vpx_filter_block1d8_h2_ssse3):
push rbp
mov rbp, rsp
@@ -313,7 +313,7 @@ sym(vpx_filter_block1d8_h2_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_h2_ssse3)
sym(vpx_filter_block1d16_h2_ssse3):
push rbp
mov rbp, rsp
@@ -340,7 +340,7 @@ sym(vpx_filter_block1d16_h2_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_h2_avg_ssse3)
sym(vpx_filter_block1d4_h2_avg_ssse3):
push rbp
mov rbp, rsp
@@ -365,7 +365,7 @@ sym(vpx_filter_block1d4_h2_avg_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_h2_avg_ssse3)
sym(vpx_filter_block1d8_h2_avg_ssse3):
push rbp
mov rbp, rsp
@@ -392,7 +392,7 @@ sym(vpx_filter_block1d8_h2_avg_ssse3):
pop rbp
ret
-global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_h2_avg_ssse3)
sym(vpx_filter_block1d16_h2_avg_ssse3):
push rbp
mov rbp, rsp
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/asmdefs_mmi.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/asmdefs_mmi.h
index 28355bf9f..400a51cc3 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/asmdefs_mmi.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/asmdefs_mmi.h
@@ -34,7 +34,7 @@
"ld " #reg ", " #bias "(" #addr ") \n\t"
#define MMI_SRL(reg1, reg2, shift) \
- "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t"
+ "ssrld " #reg1 ", " #reg2 ", " #shift " \n\t"
#define MMI_SLL(reg1, reg2, shift) \
"dsll " #reg1 ", " #reg2 ", " #shift " \n\t"
@@ -63,7 +63,7 @@
"lw " #reg ", " #bias "(" #addr ") \n\t"
#define MMI_SRL(reg1, reg2, shift) \
- "srl " #reg1 ", " #reg2 ", " #shift " \n\t"
+ "ssrlw " #reg1 ", " #reg2 ", " #shift " \n\t"
#define MMI_SLL(reg1, reg2, shift) \
"sll " #reg1 ", " #reg2 ", " #shift " \n\t"
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/emms_mmx.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/emms_mmx.asm
index 9f33590a2..b31b25ebd 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/emms_mmx.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/emms_mmx.asm
@@ -12,7 +12,7 @@
%include "vpx_ports/x86_abi_support.asm"
section .text
-global sym(vpx_clear_system_state) PRIVATE
+globalsym(vpx_clear_system_state)
sym(vpx_clear_system_state):
emms
ret
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/float_control_word.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/float_control_word.asm
index 256dae084..bb75b7a31 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/float_control_word.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/float_control_word.asm
@@ -14,7 +14,7 @@
section .text
%if LIBVPX_YASM_WIN64
-global sym(vpx_winx64_fldcw) PRIVATE
+globalsym(vpx_winx64_fldcw)
sym(vpx_winx64_fldcw):
sub rsp, 8
mov [rsp], rcx ; win x64 specific
@@ -23,7 +23,7 @@ sym(vpx_winx64_fldcw):
ret
-global sym(vpx_winx64_fstcw) PRIVATE
+globalsym(vpx_winx64_fstcw)
sym(vpx_winx64_fstcw):
sub rsp, 8
fstcw [rsp]
diff --git a/TMessagesProj/jni/voip/webrtc/modules/desktop_capture/screen_capturer_null.cc b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/mips.h
similarity index 52%
rename from TMessagesProj/jni/voip/webrtc/modules/desktop_capture/screen_capturer_null.cc
rename to TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/mips.h
index 6b1ccb322..bdc7525f7 100644
--- a/TMessagesProj/jni/voip/webrtc/modules/desktop_capture/screen_capturer_null.cc
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/mips.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ * Copyright (c) 2020 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,14 +8,20 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "modules/desktop_capture/desktop_capturer.h"
+#ifndef VPX_PORTS_MIPS_H_
+#define VPX_PORTS_MIPS_H_
-namespace webrtc {
+#ifdef __cplusplus
+extern "C" {
+#endif
-// static
-std::unique_ptr<DesktopCapturer> DesktopCapturer::CreateRawScreenCapturer(
- const DesktopCaptureOptions& options) {
- return nullptr;
-}
+#define HAS_MMI 0x01
+#define HAS_MSA 0x02
-} // namespace webrtc
+int mips_cpu_caps(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_PORTS_MIPS_H_
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/mips_cpudetect.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/mips_cpudetect.c
new file mode 100644
index 000000000..e0eca2d48
--- /dev/null
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/mips_cpudetect.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "vpx_ports/mips.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#if defined(__mips__) && defined(__linux__)
+int mips_cpu_caps(void) {
+ char cpuinfo_line[512];
+ int flag = 0x0;
+ FILE *f = fopen("/proc/cpuinfo", "r");
+ if (!f) {
+ // Assume nothing if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return 0;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+ // Workaround early kernel without mmi in ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-3")) {
+ flag |= HAS_MMI;
+ } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= HAS_MMI | HAS_MSA;
+ }
+ }
+ if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+ if (strstr(cpuinfo_line, "loongson-mmi") &&
+ strstr(cpuinfo_line, "loongson-ext")) {
+ flag |= HAS_MMI;
+ }
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= HAS_MSA;
+ }
+ // ASEs is the last line, so we can break here.
+ break;
+ }
+ }
+ fclose(f);
+ return flag;
+}
+#else /* end __mips__ && __linux__ */
+#error \
+ "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
+#else /* end CONFIG_RUNTIME_CPU_DETECT */
+int mips_cpu_caps(void) { return 0; }
+#endif
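
A minimal usage sketch (not part of the patch): how a caller might consume the new mips_cpu_caps() bitmask to select a kernel at init time. The picker function and the two kernel pointers below are illustrative placeholders, not symbols added by this change.

    #include "vpx_ports/mips.h"

    typedef unsigned int (*sad_fn_t)(const unsigned char *src, int src_stride,
                                     const unsigned char *ref, int ref_stride);

    /* Prefer the MMI kernel when the runtime detector reports HAS_MMI,
     * otherwise fall back to the plain C kernel. */
    static sad_fn_t pick_sad_kernel(sad_fn_t c_kernel, sad_fn_t mmi_kernel) {
      const int caps = mips_cpu_caps(); /* 0, HAS_MMI, HAS_MSA, or a combination */
      return (caps & HAS_MMI) ? mmi_kernel : c_kernel;
    }
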
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
index 233177369..e5001be49 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
@@ -42,6 +42,9 @@ PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h
PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c
PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h
+PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h
+
ifeq ($(VPX_ARCH_MIPS), yes)
PORTS_SRCS-yes += asmdefs_mmi.h
endif
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
index 7e1230ba3..6b2d6b968 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
@@ -16,6 +16,17 @@
; In general, we make the source use 64 bit syntax, then twiddle with it using
; the preprocessor to get the 32 bit syntax on 32 bit platforms.
;
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
%if ABI_IS_32BIT
%define rax eax
@@ -78,34 +89,51 @@
%define LIBVPX_YASM_WIN64 0
%endif
+; Declare groups of platforms
+%ifidn __OUTPUT_FORMAT__,elf32
+ %define LIBVPX_ELF 1
+%elifidn __OUTPUT_FORMAT__,elfx32
+ %define LIBVPX_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define LIBVPX_ELF 1
+%else
+ %define LIBVPX_ELF 0
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho32
+ %define LIBVPX_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define LIBVPX_MACHO 1
+%else
+ %define LIBVPX_MACHO 0
+%endif
+
; sym()
; Return the proper symbol name for the target ABI.
;
; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
; with C linkage be prefixed with an underscore.
;
-%ifidn __OUTPUT_FORMAT__,elf32
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elf64
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elfx32
-%define sym(x) x
-%elif LIBVPX_YASM_WIN64
-%define sym(x) x
+%if LIBVPX_ELF || LIBVPX_YASM_WIN64
+ %define sym(x) x
%else
-%define sym(x) _ %+ x
+ ; Mach-O / COFF
+ %define sym(x) _ %+ x
%endif
-; PRIVATE
-; Macro for the attribute to hide a global symbol for the target ABI.
-; This is only active if CHROMIUM is defined.
+; globalsym()
+; Return a global declaration with the proper decoration for the target ABI.
;
-; Chromium doesn't like exported global symbols due to symbol clashing with
-; plugins among other things.
+; When CHROMIUM is defined, include attributes to hide the symbol from the
+; global namespace.
;
-; Requires Chromium's patched copy of yasm:
-; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
-; http://www.tortall.net/projects/yasm/ticket/236
+; Chromium doesn't like exported global symbols due to symbol clashing with
+; plugins among other things.
+;
+; Requires Chromium's patched copy of yasm:
+; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+; http://www.tortall.net/projects/yasm/ticket/236
+; or nasm > 2.14.
;
%ifdef CHROMIUM
%ifdef __NASM_VER__
@@ -115,19 +143,16 @@
%endif
%endif
- %ifidn __OUTPUT_FORMAT__,elf32
- %define PRIVATE :hidden
- %elifidn __OUTPUT_FORMAT__,elf64
- %define PRIVATE :hidden
- %elifidn __OUTPUT_FORMAT__,elfx32
- %define PRIVATE :hidden
- %elif LIBVPX_YASM_WIN64
- %define PRIVATE
+ %if LIBVPX_ELF
+ %define globalsym(x) global sym(x) %+ :function hidden
+ %elif LIBVPX_MACHO
+ %define globalsym(x) global sym(x) %+ :private_extern
%else
- %define PRIVATE :private_extern
+ ; COFF / PE32+
+ %define globalsym(x) global sym(x)
%endif
%else
- %define PRIVATE
+ %define globalsym(x) global sym(x)
%endif
; arg()
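The rewritten macros fold the per-ABI symbol decoration into one place: per sym(), ELF (and yasm win64) exports keep the bare name, while Mach-O and COFF get the traditional leading underscore, and when CHROMIUM is defined globalsym() additionally marks the symbol :function hidden (ELF) or :private_extern (Mach-O). From C or C++ the underscore is never written out, because the toolchain applies the matching decoration. A hedged sketch with a purely hypothetical routine name, not part of libvpx:

    // "vpx_example_row_sse2" is invented for illustration. On ELF the
    // assembly-level symbol is the same string; on Mach-O / 32-bit COFF it
    // becomes "_vpx_example_row_sse2". The compiler decorates the call the
    // same way, so the declaration below links against
    // globalsym(vpx_example_row_sse2) on every platform without #ifdefs.
    extern "C" void vpx_example_row_sse2(const unsigned char* src,
                                         unsigned char* dst, int width);

    void CallExampleRow(const unsigned char* src, unsigned char* dst, int w) {
      vpx_example_row_sse2(src, dst, w);
    }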
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.c b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.c
index 50c36bedd..5d7546eb2 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.c
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.c
@@ -95,6 +95,8 @@ static const arg_def_t debugmode =
ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)");
static const arg_def_t outputfile =
ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t use_nv12 =
+ ARG_DEF(NULL, "nv12", 0, "Input file is NV12 ");
static const arg_def_t use_yv12 =
ARG_DEF(NULL, "yv12", 0, "Input file is YV12 ");
static const arg_def_t use_i420 =
@@ -220,7 +222,8 @@ static const arg_def_t error_resilient =
static const arg_def_t lag_in_frames =
ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
-static const arg_def_t *global_args[] = { &use_yv12,
+static const arg_def_t *global_args[] = { &use_nv12,
+ &use_yv12,
&use_i420,
&use_i422,
&use_i444,
@@ -462,6 +465,13 @@ static const arg_def_t target_level = ARG_DEF(
static const arg_def_t row_mt =
ARG_DEF(NULL, "row-mt", 1,
"Enable row based non-deterministic multi-threading in VP9");
+
+static const arg_def_t disable_loopfilter =
+ ARG_DEF(NULL, "disable-loopfilter", 1,
+ "Control Loopfilter in VP9\n"
+ "0: Loopfilter on for all frames (default)\n"
+ "1: Loopfilter off for non reference frames\n"
+ "2: Loopfilter off for all frames");
#endif
#if CONFIG_VP9_ENCODER
@@ -492,6 +502,10 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
&max_gf_interval,
&target_level,
&row_mt,
+ &disable_loopfilter,
+// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The
+// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They
+// must be listed at the end of vp9_args.
#if CONFIG_VP9_HIGHBITDEPTH
&bitdeptharg,
&inbitdeptharg,
@@ -524,6 +538,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
VP9E_SET_MAX_GF_INTERVAL,
VP9E_SET_TARGET_LEVEL,
VP9E_SET_ROW_MT,
+ VP9E_SET_DISABLE_LOOPFILTER,
0 };
#endif
@@ -696,6 +711,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
global->deadline = VPX_DL_REALTIME;
else if (arg_match(&arg, &use_yv12, argi))
global->color_type = YV12;
+ else if (arg_match(&arg, &use_nv12, argi))
+ global->color_type = NV12;
else if (arg_match(&arg, &use_i420, argi))
global->color_type = I420;
else if (arg_match(&arg, &use_i422, argi))
@@ -1642,6 +1659,7 @@ int main(int argc, const char **argv_) {
case I444: input.fmt = VPX_IMG_FMT_I444; break;
case I440: input.fmt = VPX_IMG_FMT_I440; break;
case YV12: input.fmt = VPX_IMG_FMT_YV12; break;
+ case NV12: input.fmt = VPX_IMG_FMT_NV12; break;
}
{
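With --nv12 wired through to VPX_IMG_FMT_NV12, vpxenc can read raw NV12 input directly (e.g. vpxenc --codec=vp9 --nv12 -w 640 -h 360 -o out.webm in.yuv). A hedged sketch of preparing an NV12 vpx_image_t programmatically; it assumes the companion vpx_image.h change in this patch set defines VPX_IMG_FMT_NV12 and teaches vpx_img_alloc() about it, and the plane-layout comment is an expectation rather than a guarantee:

    #include <cstring>
    #include "vpx/vpx_image.h"

    bool MakeNv12Frame(unsigned int width, unsigned int height) {
      vpx_image_t img;
      if (!vpx_img_alloc(&img, VPX_IMG_FMT_NV12, width, height, 16)) return false;
      // Luma plane, full resolution.
      std::memset(img.planes[VPX_PLANE_Y], 0x10,
                  static_cast<size_t>(img.stride[VPX_PLANE_Y]) * height);
      // For NV12 the chroma is interleaved; it is expected to sit behind
      // planes[VPX_PLANE_U], with stride[VPX_PLANE_U] covering U and V together.
      std::memset(img.planes[VPX_PLANE_U], 0x80,
                  static_cast<size_t>(img.stride[VPX_PLANE_U]) * ((height + 1) / 2));
      vpx_img_free(&img);
      return true;
    }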
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.h b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.h
index b780aedca..be54840f7 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.h
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/vpxenc.h
@@ -28,6 +28,7 @@ typedef enum {
I444, // 4:4:4 8+ bit-depth
I440, // 4:4:0 8+ bit-depth
YV12, // 4:2:0 with uv flipped, only 8-bit depth
+ NV12, // 4:2:0 with uv interleaved
} ColorInputType;
struct VpxInterface;
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmdec.cc b/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmdec.cc
index d609075a9..68c6f4782 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmdec.cc
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmdec.cc
@@ -19,25 +19,25 @@
namespace {
void reset(struct WebmInputContext *const webm_ctx) {
- if (webm_ctx->reader != NULL) {
+ if (webm_ctx->reader != nullptr) {
mkvparser::MkvReader *const reader =
reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
delete reader;
}
- if (webm_ctx->segment != NULL) {
+ if (webm_ctx->segment != nullptr) {
mkvparser::Segment *const segment =
reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
delete segment;
}
- if (webm_ctx->buffer != NULL) {
+ if (webm_ctx->buffer != nullptr) {
delete[] webm_ctx->buffer;
}
- webm_ctx->reader = NULL;
- webm_ctx->segment = NULL;
- webm_ctx->buffer = NULL;
- webm_ctx->cluster = NULL;
- webm_ctx->block_entry = NULL;
- webm_ctx->block = NULL;
+ webm_ctx->reader = nullptr;
+ webm_ctx->segment = nullptr;
+ webm_ctx->buffer = nullptr;
+ webm_ctx->cluster = nullptr;
+ webm_ctx->block_entry = nullptr;
+ webm_ctx->block = nullptr;
webm_ctx->block_frame_index = 0;
webm_ctx->video_track_index = 0;
webm_ctx->timestamp_ns = 0;
@@ -84,7 +84,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
}
const mkvparser::Tracks *const tracks = segment->GetTracks();
- const mkvparser::VideoTrack *video_track = NULL;
+ const mkvparser::VideoTrack *video_track = nullptr;
for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
if (track->GetType() == mkvparser::Track::kVideo) {
@@ -94,7 +94,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
}
}
- if (video_track == NULL || video_track->GetCodecId() == NULL) {
+ if (video_track == nullptr || video_track->GetCodecId() == nullptr) {
rewind_and_reset(webm_ctx, vpx_ctx);
return 0;
}
@@ -137,12 +137,12 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
do {
long status = 0;
bool get_new_block = false;
- if (block_entry == NULL && !block_entry_eos) {
+ if (block_entry == nullptr && !block_entry_eos) {
status = cluster->GetFirst(block_entry);
get_new_block = true;
} else if (block_entry_eos || block_entry->EOS()) {
cluster = segment->GetNext(cluster);
- if (cluster == NULL || cluster->EOS()) {
+ if (cluster == nullptr || cluster->EOS()) {
*buffer_size = 0;
webm_ctx->reached_eos = 1;
return 1;
@@ -150,22 +150,22 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
status = cluster->GetFirst(block_entry);
block_entry_eos = false;
get_new_block = true;
- } else if (block == NULL ||
+ } else if (block == nullptr ||
webm_ctx->block_frame_index == block->GetFrameCount() ||
block->GetTrackNumber() != webm_ctx->video_track_index) {
status = cluster->GetNext(block_entry, block_entry);
- if (block_entry == NULL || block_entry->EOS()) {
+ if (block_entry == nullptr || block_entry->EOS()) {
block_entry_eos = true;
continue;
}
get_new_block = true;
}
- if (status || block_entry == NULL) {
+ if (status || block_entry == nullptr) {
return -1;
}
if (get_new_block) {
block = block_entry->GetBlock();
- if (block == NULL) return -1;
+ if (block == nullptr) return -1;
webm_ctx->block_frame_index = 0;
}
} while (block_entry_eos ||
@@ -181,7 +181,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
if (frame.len > static_cast<long>(*buffer_size)) {
delete[] * buffer;
*buffer = new uint8_t[frame.len];
- if (*buffer == NULL) {
+ if (*buffer == nullptr) {
return -1;
}
webm_ctx->buffer = *buffer;
@@ -198,7 +198,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
int webm_guess_framerate(struct WebmInputContext *webm_ctx,
struct VpxInputContext *vpx_ctx) {
uint32_t i = 0;
- uint8_t *buffer = NULL;
+ uint8_t *buffer = nullptr;
size_t buffer_size = 0;
while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) {
@@ -212,8 +212,8 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
delete[] buffer;
get_first_cluster(webm_ctx);
- webm_ctx->block = NULL;
- webm_ctx->block_entry = NULL;
+ webm_ctx->block = nullptr;
+ webm_ctx->block_entry = nullptr;
webm_ctx->block_frame_index = 0;
webm_ctx->timestamp_ns = 0;
webm_ctx->reached_eos = 0;
diff --git a/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmenc.cc b/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmenc.cc
index 66606674b..c718ab5a9 100644
--- a/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmenc.cc
+++ b/TMessagesProj/jni/third_party/libvpx/source/libvpx/webmenc.cc
@@ -90,6 +90,6 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
segment->Finalize();
delete segment;
delete writer;
- webm_ctx->writer = NULL;
- webm_ctx->segment = NULL;
+ webm_ctx->writer = nullptr;
+ webm_ctx->segment = nullptr;
}
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv.h
index de652836e..aeffd5ef7 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_H_
#define INCLUDE_LIBYUV_H_
#include "libyuv/basic_types.h"
@@ -29,4 +29,4 @@
#include "libyuv/version.h"
#include "libyuv/video_common.h"
-#endif // INCLUDE_LIBYUV_H_ NOLINT
+#endif // INCLUDE_LIBYUV_H_
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert.h
index f571142fa..026b153ce 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert.h
@@ -42,6 +42,21 @@ int I444ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Convert I444 to NV21.
LIBYUV_API
int I444ToNV21(const uint8_t* src_y,
@@ -248,19 +263,6 @@ int AYUVToNV21(const uint8_t* src_ayuv,
int width,
int height);
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
// Convert Android420 to I420.
LIBYUV_API
int Android420ToI420(const uint8_t* src_y,
@@ -418,7 +420,15 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
int width,
int height);
-#ifdef HAVE_JPEG
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
@@ -448,13 +458,25 @@ int MJPGToNV21(const uint8_t* sample,
int dst_width,
int dst_height);
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8_t* sample,
size_t sample_size,
int* width,
int* height);
-#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_argb.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_argb.h
index bf776348f..715a3dad9 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_argb.h
@@ -15,16 +15,41 @@
#include "libyuv/rotate.h" // For enum RotationMode.
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// Conversion matrix for YUV to RGB
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+
+// Conversion matrix for YVU to BGR
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+
+// Macros for end swapped destination Matrix conversions.
+// Swap UV and pass mirrored kYvuJPEGConstants matrix.
+// TODO(fbarchard): Add macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+
// Alias.
#define ARGBToARGB ARGBCopy
@@ -657,15 +682,6 @@ int NV21ToRAW(const uint8_t* src_y,
int width,
int height);
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8_t* src_yuy2,
@@ -956,7 +972,6 @@ int AR30ToAB30(const uint8_t* src_ar30,
int width,
int height);
-#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
@@ -968,7 +983,6 @@ int MJPGToARGB(const uint8_t* sample,
int src_height,
int dst_width,
int dst_height);
-#endif
// Convert Android420 to ARGB.
LIBYUV_API
@@ -998,6 +1012,561 @@ int Android420ToABGR(const uint8_t* src_y,
int width,
int height);
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "sample_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from.h
index afc43939a..5140ed4f3 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from.h
@@ -132,6 +132,10 @@ int I420ToUYVY(const uint8_t* src_y,
int width,
int height);
+// The following are from convert_argb.h
+// DEPRECATED: The prototypes will be removed in future. Use convert_argb.h
+
+// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
@@ -144,18 +148,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -168,205 +161,6 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int J420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int J420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height);
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from_argb.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from_argb.h
index 057182448..d992363ce 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,
@@ -281,17 +285,6 @@ int ABGRToNV21(const uint8_t* src_abgr,
int width,
int height);
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
// Convert ARGB To YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8_t* src_argb,
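The ABGRToRGB24/ABGRToRAW aliases hold because, in libyuv's byte-order naming, both conversions keep the first three bytes of each 4-byte pixel and reverse them; only the labels differ. A one-call sketch (the macro expands to ARGBToRAW at compile time):

    #include <cstdint>
    #include "libyuv/convert_from_argb.h"

    int AbgrToRgb24(const uint8_t* abgr, int abgr_stride,
                    uint8_t* rgb24, int rgb24_stride, int width, int height) {
      // Drops the alpha byte and reverses the remaining channel bytes.
      return libyuv::ABGRToRGB24(abgr, abgr_stride, rgb24, rgb24_stride,
                                 width, height);
    }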
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/cpu_id.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/cpu_id.h
index b01cd25c5..3e27cc107 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/cpu_id.h
@@ -71,6 +71,8 @@ static __inline int TestCpuFlag(int test_flag) {
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
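MipsCpuCaps() mirrors the existing ArmCpuCaps() hook: it parses /proc/cpuinfo and is expected to feed the lazy CPU-flag initialization, so application code keeps going through TestCpuFlag(). A small sketch (kCpuHasMSA and kCpuHasMMI are libyuv's existing flag constants):

    #include <cstdio>
    #include "libyuv/cpu_id.h"

    void ReportMipsSimdSupport() {
      // TestCpuFlag() initializes the flags on first use.
      if (libyuv::TestCpuFlag(libyuv::kCpuHasMSA)) std::printf("MSA enabled\n");
      if (libyuv::TestCpuFlag(libyuv::kCpuHasMMI)) std::printf("MMI enabled\n");
    }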
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/macros_msa.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/macros_msa.h
index 29997ce11..4e232b66b 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/macros_msa.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/macros_msa.h
@@ -140,6 +140,9 @@
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
+
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/planar_functions.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/planar_functions.h
index 5299fe2c0..9e0038f47 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/planar_functions.h
@@ -105,6 +105,19 @@ void MergeUVPlane(const uint8_t* src_u,
int width,
int height);
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Swap U and V channels in interleaved UV plane.
LIBYUV_API
void SwapUVPlane(const uint8_t* src_uv,
@@ -301,6 +314,22 @@ int I400Mirror(const uint8_t* src_y,
int width,
int height);
+// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Alias
#define ARGBToARGBMirror ARGBMirror
@@ -313,56 +342,35 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
-// Convert NV12 to RGB565.
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
+
+// RGB24 mirror.
LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Mirror a plane of data.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
+ uint8_t* dst_y,
+ int dst_stride_y,
int width,
int height);
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
+// Mirror a plane of UV data.
LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
// Alias
#define RGB24ToRAW RAWToRGB24
@@ -743,6 +751,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
+// Gaussian 5x5 blur a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height);
+
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
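HalfMergeUVPlane() is the plane-level building block for the NV12 paths: it downsamples U and V by two in each direction while interleaving them. A minimal sketch producing NV12 from I444 planes, with the existing CopyPlane() handling the luma; tight strides assumed:

    #include <cstdint>
    #include "libyuv/planar_functions.h"

    void I444PlanesToNV12(const uint8_t* src_y, const uint8_t* src_u,
                          const uint8_t* src_v, int width, int height,
                          uint8_t* dst_y, uint8_t* dst_uv) {
      const int dst_uv_stride = ((width + 1) / 2) * 2;
      // Luma is unchanged between I444 and NV12.
      libyuv::CopyPlane(src_y, width, dst_y, width, width, height);
      // width/height are the source chroma dimensions (full size for I444);
      // the output UV plane is half width, half height, interleaved.
      libyuv::HalfMergeUVPlane(src_u, width, src_v, width,
                               dst_uv, dst_uv_stride, width, height);
    }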
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/rotate.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/rotate.h
index c64e0216d..308882242 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/rotate.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/rotate.h
@@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/row.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/row.h
index b721858f1..a27788c1f 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/row.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/row.h
@@ -98,7 +98,6 @@ extern "C" {
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
#define HAS_HALFFLOATROW_SSE2
-#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
@@ -112,7 +111,7 @@ extern "C" {
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@@ -123,6 +122,8 @@ extern "C" {
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
@@ -194,11 +195,12 @@ extern "C" {
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
-#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TOARGBROW_AVX2
@@ -269,12 +271,16 @@ extern "C" {
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
+#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
@@ -293,8 +299,10 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
@@ -338,7 +346,6 @@ extern "C" {
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
-#define HAS_RGBATOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON
#define HAS_AYUVTOVUROW_NEON
@@ -348,6 +355,7 @@ extern "C" {
#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -363,6 +371,7 @@ extern "C" {
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
@@ -370,17 +379,20 @@ extern "C" {
#define HAS_NV21TORGB24ROW_NEON
#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
@@ -402,6 +414,7 @@ extern "C" {
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSEPIAROW_NEON
@@ -419,6 +432,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_ABGRTOUVROW_MSA
@@ -470,6 +486,7 @@ extern "C" {
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
#define HAS_MIRRORUVROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
#define HAS_NV21TOARGBROW_MSA
@@ -552,7 +569,7 @@ extern "C" {
#define HAS_MERGERGBROW_MMI
#define HAS_MERGEUVROW_MMI
#define HAS_MIRRORROW_MMI
-#define HAS_MIRRORUVROW_MMI
+#define HAS_MIRRORSPLITUVROW_MMI
#define HAS_RAWTOARGBROW_MMI
#define HAS_RAWTORGB24ROW_MMI
#define HAS_RAWTOUVROW_MMI
@@ -601,6 +618,7 @@ extern "C" {
#endif
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -620,6 +638,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#endif
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -634,6 +653,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#define SIMD_ALIGNED(var) var
typedef int16_t vec16[8];
typedef int32_t vec32[4];
+typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
@@ -674,6 +694,7 @@ struct YuvConstants {
int16_t kUVBiasG[16];
int16_t kUVBiasR[16];
int16_t kYToRgb[16];
+ int16_t kYBiasToRgb[16];
};
// Offsets into YuvConstants structure
@@ -684,20 +705,10 @@ struct YuvConstants {
#define KUVBIASG 128
#define KUVBIASR 160
#define KYTORGB 192
+#define KYBIASTORGB 224
+
#endif
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
-extern const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants); // BT.2020
-
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
-extern const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants); // BT.2020
-
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
#define align_buffer_64(var, size) \
@@ -965,7 +976,11 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
@@ -1134,7 +1149,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@@ -1165,7 +1182,9 @@ void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
@@ -1175,8 +1194,14 @@ void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1184,7 +1209,9 @@ void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1542,27 +1569,36 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -1582,6 +1618,16 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1672,6 +1718,34 @@ void MergeUVRow_Any_MMI(const uint8_t* y_buf,
uint8_t* dst_ptr,
int width);
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -2728,23 +2802,50 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
@@ -4256,6 +4357,25 @@ void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale.h
index 23ba1634f..add5a9eb6 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale.h
@@ -145,6 +145,31 @@ int I444Scale_16(const uint16_t* src_y,
int dst_height,
enum FilterMode filtering);
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
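// Usage sketch for the NV12Scale() declared in the hunk above: a minimal
// caller, assuming even frame dimensions and tightly packed planes
// (stride == width). As the comment notes, kFilterBilinear trades speed for
// quality and kFilterBox degrades to bilinear on the UV plane.
#include <cstdint>
#include <vector>
#include "libyuv/scale.h"

int ScaleNV12Frame(const uint8_t* src_y, const uint8_t* src_uv,
                   int src_w, int src_h, int dst_w, int dst_h,
                   std::vector<uint8_t>* dst_y, std::vector<uint8_t>* dst_uv) {
  dst_y->resize(static_cast<size_t>(dst_w) * dst_h);               // full-res Y
  dst_uv->resize(static_cast<size_t>(dst_w) * ((dst_h + 1) / 2));  // half-res interleaved UV
  return libyuv::NV12Scale(src_y, src_w, src_uv, src_w, src_w, src_h,
                           dst_y->data(), dst_w, dst_uv->data(), dst_w,
                           dst_w, dst_h, libyuv::kFilterBilinear);
}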
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_row.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_row.h
index dd20718a8..a386d4998 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_row.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_row.h
@@ -72,6 +72,22 @@ extern "C" {
#define HAS_SCALEROWDOWN4_SSSE3
#endif
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
+#endif
+
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
@@ -96,6 +112,8 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -376,6 +394,53 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
int dst_width,
int x32,
int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int,
+ int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
@@ -782,6 +847,192 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_uv.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_uv.h
new file mode 100644
index 000000000..1b6327aae
--- /dev/null
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/scale_uv.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_UV_H_
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/version.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/version.h
index 4c446ba3d..1d085960e 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/version.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1741
+#define LIBYUV_VERSION 1767
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/TMessagesProj/jni/third_party/libyuv/include/libyuv/video_common.h b/TMessagesProj/jni/third_party/libyuv/include/libyuv/video_common.h
index 666eb3439..b9823d71d 100644
--- a/TMessagesProj/jni/third_party/libyuv/include/libyuv/video_common.h
+++ b/TMessagesProj/jni/third_party/libyuv/include/libyuv/video_common.h
@@ -62,7 +62,7 @@ enum FourCC {
FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
FOURCC_I210 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 422
- // 1 Secondary YUV format: row biplanar.
+ // 1 Secondary YUV format: row biplanar. deprecated.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
// 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
@@ -86,10 +86,14 @@ enum FourCC {
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
- FOURCC_J422 = FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
- FOURCC_J444 = FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
- FOURCC_J400 = FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J420 =
+ FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J422 =
+ FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J444 =
+ FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J400 =
+ FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc
FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc
FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc
@@ -144,7 +148,7 @@ enum FourCCBpp {
FOURCC_BPP_NV12 = 12,
FOURCC_BPP_YUY2 = 16,
FOURCC_BPP_UYVY = 16,
- FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_M420 = 12, // deprecated
FOURCC_BPP_Q420 = 12,
FOURCC_BPP_ARGB = 32,
FOURCC_BPP_BGRA = 32,
diff --git a/TMessagesProj/jni/third_party/libyuv/source/compare.cc b/TMessagesProj/jni/third_party/libyuv/source/compare.cc
index 7f4828104..e93aba1b5 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/compare.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/compare.cc
@@ -149,16 +149,16 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
HammingDistance = HammingDistance_AVX2;
}
#endif
-#if defined(HAS_HAMMINGDISTANCE_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- HammingDistance = HammingDistance_MSA;
- }
-#endif
#if defined(HAS_HAMMINGDISTANCE_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
HammingDistance = HammingDistance_MMI;
}
#endif
+#if defined(HAS_HAMMINGDISTANCE_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ HammingDistance = HammingDistance_MSA;
+ }
+#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
@@ -211,16 +211,16 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_AVX2;
}
#endif
-#if defined(HAS_SUMSQUAREERROR_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SumSquareError = SumSquareError_MSA;
- }
-#endif
#if defined(HAS_SUMSQUAREERROR_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SumSquareError = SumSquareError_MMI;
}
#endif
+#if defined(HAS_SUMSQUAREERROR_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SumSquareError = SumSquareError_MSA;
+ }
+#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
diff --git a/TMessagesProj/jni/third_party/libyuv/source/compare_gcc.cc b/TMessagesProj/jni/third_party/libyuv/source/compare_gcc.cc
index 676527c1b..6700f9697 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/compare_gcc.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/compare_gcc.cc
@@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
uint64_t diff = 0u;
asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t diff = 0u;
asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
diff --git a/TMessagesProj/jni/third_party/libyuv/source/compare_neon.cc b/TMessagesProj/jni/third_party/libyuv/source/compare_neon.cc
index 2a2181e0c..afdd60121 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/compare_neon.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/compare_neon.cc
@@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t diff;
asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
+ "vmov.u16 q4, #0 \n" // accumulator
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
@@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
"1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
diff --git a/TMessagesProj/jni/third_party/libyuv/source/compare_neon64.cc b/TMessagesProj/jni/third_party/libyuv/source/compare_neon64.cc
index 6e8f672ab..70fb9b914 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/compare_neon64.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/compare_neon64.cc
@@ -27,22 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile(
- "movi v4.8h, #0 \n"
+ "movi v4.8h, #0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
@@ -54,28 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert.cc b/TMessagesProj/jni/third_party/libyuv/source/convert.cc
index 614fa4824..98258b9bc 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert.cc
@@ -320,14 +320,6 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MergeUVRow = MergeUVRow_Any_MMI;
@@ -336,6 +328,14 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -360,14 +360,6 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -376,6 +368,14 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
@@ -426,7 +426,41 @@ int I444ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, width, height);
}
-// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
+ return 0;
+}
+
LIBYUV_API
int I444ToNV21(const uint8_t* src_y,
int src_stride_y,
@@ -440,30 +474,9 @@ int I444ToNV21(const uint8_t* src_y,
int dst_stride_vu,
int width,
int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
}
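// Usage sketch for the two entry points above, assuming a matching
// I444ToNV12() declaration is exported from libyuv/convert.h by this patch and
// that the frame has even dimensions with tightly packed planes
// (stride == width). NV21 is NV12 with the chroma bytes in VU order, which is
// why the rewritten I444ToNV21() now just forwards to I444ToNV12() with
// src_u/src_v swapped instead of going through a temporary I420 buffer.
#include <stdint.h>
#include "libyuv/convert.h"

void I444ToBothNV(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                  int w, int h, uint8_t* out_y, uint8_t* out_uv,
                  uint8_t* out_vu) {
  // Y is copied as-is; U/V are 2x2-averaged and interleaved by HalfMergeUVPlane.
  libyuv::I444ToNV12(y, w, u, w, v, w, out_y, w, out_uv, w, w, h);
  // Same conversion, but the V plane comes first in the interleaved output.
  libyuv::I444ToNV21(y, w, u, w, v, w, out_y, w, out_vu, w, w, h);
}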
// I400 is greyscale typically used in MJPG
@@ -527,120 +540,8 @@ int I400ToNV21(const uint8_t* src_y,
return 0;
}
-static void CopyPlane2(const uint8_t* src,
- int src_stride_0,
- int src_stride_1,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int y;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
- }
- if (height & 1) {
- CopyRow(src, dst, width);
- }
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8_t* src_y,
- int src_stride_y0,
- int src_stride_y1,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- if (dst_y) {
- dst_y = dst_y + (height - 1) * dst_stride_y;
- }
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- // Coalesce rows.
- if (src_stride_y0 == width && src_stride_y1 == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
- }
- // Coalesce rows.
- if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
- dst_stride_v == halfwidth) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_uv = dst_stride_u = dst_stride_v = 0;
- }
-
- if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
- }
-
- // Split UV plane - NV12 / NV21
- SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
- halfwidth, halfheight);
-
- return 0;
-}
-
// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
LIBYUV_API
int NV12ToI420(const uint8_t* src_y,
int src_stride_y,
@@ -654,9 +555,43 @@ int NV12ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
- dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
- dst_stride_v, width, height);
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Split UV plane - NV12 / NV21
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+
+ return 0;
}
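// Note on the "Coalesce rows" pattern used in NV12ToI420() above: when a
// plane's stride equals its row width there is no inter-row padding, so the
// whole plane is one contiguous run of width * height bytes and can be handed
// to a row function as a single long row. A minimal standalone sketch of the
// idea follows (CopyPlaneCoalesced is a hypothetical helper, not libyuv API):
#include <stdint.h>
#include <string.h>

static void CopyPlaneCoalesced(const uint8_t* src, int src_stride,
                               uint8_t* dst, int dst_stride,
                               int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;  // contiguous planes: process everything as one row
    height = 1;
    src_stride = dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width);
    dst += dst_stride;
    src += src_stride;
  }
}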
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
@@ -673,26 +608,8 @@ int NV21ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
- dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
- dst_stride_u, width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
width, height);
}
@@ -750,17 +667,7 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToYRow = YUY2ToYRow_Any_MSA;
- YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToYRow = YUY2ToYRow_MSA;
- YUY2ToUVRow = YUY2ToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_MMI)
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
YUY2ToYRow = YUY2ToYRow_Any_MMI;
YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
@@ -772,6 +679,16 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -843,6 +760,16 @@ int UYVYToI420(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUVRow = UYVYToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUVRow = UYVYToUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_UYVYTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
@@ -853,16 +780,6 @@ int UYVYToI420(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUVRow = UYVYToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUVRow = UYVYToUVRow_MMI;
- }
- }
-#endif
for (y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -1081,38 +998,30 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -1183,38 +1092,28 @@ int BGRAToI420(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_BGRATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToYRow = BGRAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToYRow = BGRAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToUVRow = BGRAToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_MMI)
+#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
BGRAToYRow = BGRAToYRow_Any_MMI;
+ BGRAToUVRow = BGRAToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToUVRow = BGRAToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_MMI;
}
}
#endif
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
+ BGRAToUVRow = BGRAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
+ BGRAToUVRow = BGRAToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
@@ -1269,6 +1168,16 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
@@ -1285,38 +1194,28 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToUVRow = ABGRToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MMI)
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_MMI;
}
}
#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -1387,38 +1286,28 @@ int RGBAToI420(const uint8_t* src_rgba,
}
}
#endif
-#if defined(HAS_RGBATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToYRow = RGBAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToYRow = RGBAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToUVRow = RGBAToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_MMI)
+#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGBAToYRow = RGBAToYRow_Any_MMI;
+ RGBAToUVRow = RGBAToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToUVRow = RGBAToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_MMI;
}
}
#endif
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
+ RGBAToUVRow = RGBAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
+ RGBAToUVRow = RGBAToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -1487,16 +1376,9 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
}
-#elif defined(HAS_RGB24TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
- RGB24ToYRow = RGB24ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYRow = RGB24ToYRow_MSA;
- RGB24ToUVRow = RGB24ToUVRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYROW_MMI)
+// MMI and MSA versions do direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA))
+#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
RGB24ToYRow = RGB24ToYRow_Any_MMI;
@@ -1507,6 +1389,17 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
}
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+ RGB24ToYRow = RGB24ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_MSA;
+ RGB24ToUVRow = RGB24ToUVRow_MSA;
+ }
+ }
+#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -1598,8 +1491,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)
void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVJRow_C;
@@ -1625,7 +1518,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON)
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
@@ -1636,16 +1529,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
}
-#elif defined(HAS_RGB24TOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
- RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYJRow = RGB24ToYJRow_MSA;
- RGB24ToUVJRow = RGB24ToUVJRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MMI)
+// MMI and MSA versions do direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA))
+#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
@@ -1656,7 +1542,17 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
}
-// Other platforms do intermediate conversion from RGB24 to ARGB.
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#endif
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -1689,16 +1585,16 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
#endif
{
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYJRow(src_rgb24, dst_y, width);
RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -1715,8 +1611,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYJRow(src_rgb24, dst_y, width);
#else
@@ -1725,8 +1621,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
ARGBToYJRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1746,8 +1642,8 @@ int RAWToI420(const uint8_t* src_raw,
int width,
int height) {
int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
+#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+ defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)
void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
uint8_t* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
@@ -1772,7 +1668,7 @@ int RAWToI420(const uint8_t* src_raw,
}
// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON;
@@ -1783,16 +1679,9 @@ int RAWToI420(const uint8_t* src_raw,
}
}
}
-#elif defined(HAS_RAWTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToUVRow = RAWToUVRow_Any_MSA;
- RAWToYRow = RAWToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToYRow = RAWToYRow_MSA;
- RAWToUVRow = RAWToUVRow_MSA;
- }
- }
-#elif defined(HAS_RAWTOYROW_MMI)
+// MMI and MSA versions do direct RAW to YUV.
+#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA))
+#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RAWToUVRow = RAWToUVRow_Any_MMI;
RAWToYRow = RAWToYRow_Any_MMI;
@@ -1803,6 +1692,17 @@ int RAWToI420(const uint8_t* src_raw,
}
}
}
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVRow = RAWToUVRow_Any_MSA;
+ RAWToYRow = RAWToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_MSA;
+ RAWToUVRow = RAWToUVRow_MSA;
+ }
+ }
+#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -1931,16 +1831,9 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
}
-#elif defined(HAS_RGB565TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
- RGB565ToYRow = RGB565ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToYRow = RGB565ToYRow_MSA;
- RGB565ToUVRow = RGB565ToUVRow_MSA;
- }
- }
-#elif defined(HAS_RGB565TOYROW_MMI)
+// MMI and MSA versions do direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA))
+#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
RGB565ToYRow = RGB565ToYRow_Any_MMI;
@@ -1951,6 +1844,17 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
}
+#endif
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+ RGB565ToYRow = RGB565ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_MSA;
+ RGB565ToUVRow = RGB565ToUVRow_MSA;
+ }
+ }
+#endif
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -2086,16 +1990,9 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
}
-#elif defined(HAS_ARGB1555TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToYRow = ARGB1555ToYRow_MSA;
- ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
- }
- }
-#elif defined(HAS_ARGB1555TOYROW_MMI)
+// MMI and MSA versions do direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA))
+#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
@@ -2106,6 +2003,17 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
}
+#endif
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+ }
+ }
+#endif
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -2243,7 +2151,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
}
-#elif defined(HAS_ARGB4444TOYROW_MMI)
+#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
@@ -2300,19 +2208,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToUVRow = ARGBToUVRow_Any_MMI;
ARGBToYRow = ARGBToYRow_Any_MMI;
@@ -2324,6 +2220,18 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+ }
+#endif
#endif
{
@@ -2378,27 +2286,38 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
RGB24ToYJRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
- ARGBToYJRow_C;
-#endif
if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
return -1;
}
- // Negative height means invert the image.
if (height < 0) {
height = -height;
src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
src_stride_rgb24 = -src_stride_rgb24;
}
-
-// Neon version does direct RGB24 to YUV.
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_RGB24TOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
@@ -2406,83 +2325,102 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
RGB24ToYJRow = RGB24ToYJRow_NEON;
}
}
-#elif defined(HAS_RGB24TOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYJRow = RGB24ToYJRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MMI)
+#endif
+#if defined(HAS_RGB24TOYJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
RGB24ToYJRow = RGB24ToYJRow_MMI;
}
}
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+#if defined(HAS_RAWTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
+ RAWToYJRow = RAWToYJRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
+#if defined(HAS_RAWTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToYJRow = ARGBToYJRow_AVX2;
+ RAWToYJRow = RAWToYJRow_AVX2;
}
}
#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToYJRow = RAWToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ }
+ }
#endif
- {
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
- RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToYJRow(row, dst_yj, width);
- ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_yj += dst_stride_yj * 2;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToYJRow(row, dst_yj, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- free_aligned_buffer_64(row);
-#endif
+ for (y = 0; y < height; ++y) {
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
}
return 0;
}
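
Editor's note: the rewritten RGB24ToJ400 and the new RAWToJ400 above drop the old two-row ARGB staging buffer in favor of single-pass RGB-to-gray row kernels, and they coalesce contiguous planes into one long row before looping ("Coalesce rows" in the added code). A minimal sketch of that coalescing pattern follows; the ConvertToGrayPlane helper name is illustrative only and is not part of this patch.

#include <stdint.h>

int ConvertToGrayPlane(const uint8_t* src, int src_stride,   /* 3 bytes/pixel */
                       uint8_t* dst, int dst_stride,         /* 1 byte/pixel  */
                       int width, int height,
                       void (*RowFn)(const uint8_t* src, uint8_t* dst, int width)) {
  if (!src || !dst || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src = src + (height - 1) * src_stride;
    src_stride = -src_stride;
  }
  // Coalesce rows: when both planes are contiguous, treat the whole image as
  // one long row so the per-row stride bookkeeping disappears.
  if (src_stride == width * 3 && dst_stride == width) {
    width *= height;
    height = 1;
    src_stride = dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    RowFn(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
  return 0;
}
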
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert_argb.cc b/TMessagesProj/jni/third_party/libyuv/source/convert_argb.cc
index 4217b1dc9..5e7225faf 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert_argb.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert_argb.cc
@@ -47,18 +47,19 @@ int ARGBCopy(const uint8_t* src_argb,
return 0;
}
-// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -97,14 +98,6 @@ static int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToARGBRow = I422ToARGBRow_Any_MMI;
@@ -113,6 +106,14 @@ static int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -270,18 +271,19 @@ int U420ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -327,14 +329,6 @@ static int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToARGBRow = I422ToARGBRow_Any_MMI;
@@ -343,6 +337,14 @@ static int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -498,18 +500,19 @@ int U422ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -555,14 +558,6 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I444ToARGBRow = I444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_I444TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I444ToARGBRow = I444ToARGBRow_Any_MMI;
@@ -571,6 +566,14 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -726,20 +729,21 @@ int U444ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
+// Convert 10 bit YUV to ARGB with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I010ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -884,20 +888,21 @@ int U010ToAB30(const uint16_t* src_y,
&kYuv2020Constants, width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
+// Convert 10 bit YUV to ARGB with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I210ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -1040,18 +1045,19 @@ int U210ToAB30(const uint16_t* src_y,
&kYuv2020Constants, width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
-static int I010ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -1210,18 +1216,19 @@ int U010ToABGR(const uint16_t* src_y,
width, height);
}
-// Convert 10 bit 422 YUV to ARGB with matrix
-static int I210ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -1270,9 +1277,6 @@ static int I210ToARGBMatrix(const uint16_t* src_y,
return 0;
}
-
-
-
// Convert I210 to ARGB.
LIBYUV_API
int I210ToARGB(const uint16_t* src_y,
@@ -1381,21 +1385,22 @@ int U210ToABGR(const uint16_t* src_y,
width, height);
}
-// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height,
- int attenuate) {
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
int y;
void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* a_buf,
@@ -1437,14 +1442,6 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422ALPHATOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422ALPHATOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI;
@@ -1453,6 +1450,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -1477,14 +1482,6 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBATTENUATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
@@ -1493,6 +1490,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -1554,16 +1559,18 @@ int I420AlphaToABGR(const uint8_t* src_y,
width, height, attenuate);
}
-// Convert I400 to ARGB.
+// Convert I400 to ARGB with matrix.
LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
I400ToARGBRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1604,14 +1611,6 @@ int I400ToARGB(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I400TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I400ToARGBRow = I400ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I400ToARGBRow = I400ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_I400TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I400ToARGBRow = I400ToARGBRow_Any_MMI;
@@ -1620,15 +1619,35 @@ int I400ToARGB(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+ I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
return 0;
}
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
// Convert J400 to ARGB.
LIBYUV_API
int J400ToARGB(const uint8_t* src_y,
@@ -1679,14 +1698,6 @@ int J400ToARGB(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_J400TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- J400ToARGBRow = J400ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- J400ToARGBRow = J400ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_J400TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
J400ToARGBRow = J400ToARGBRow_Any_MMI;
@@ -1694,6 +1705,14 @@ int J400ToARGB(const uint8_t* src_y,
J400ToARGBRow = J400ToARGBRow_MMI;
}
}
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_MSA;
+ }
+ }
#endif
for (y = 0; y < height; ++y) {
J400ToARGBRow(src_y, dst_argb, width);
@@ -1817,14 +1836,6 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
-#if defined(HAS_RGB24TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_RGB24TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
@@ -1833,6 +1844,14 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -1884,14 +1903,6 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_RAWTOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToARGBRow = RAWToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_RAWTOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RAWToARGBRow = RAWToARGBRow_Any_MMI;
@@ -1900,6 +1911,14 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
@@ -2010,14 +2029,6 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
}
}
#endif
-#if defined(HAS_RGB565TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_RGB565TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
@@ -2026,6 +2037,14 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
}
}
#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -2085,14 +2104,6 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
}
}
#endif
-#if defined(HAS_ARGB1555TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGB1555TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
@@ -2101,6 +2112,14 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
}
}
#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -2160,14 +2179,6 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGB4444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGB4444TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
@@ -2176,6 +2187,14 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444,
}
}
#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -2281,16 +2300,17 @@ int AR30ToAB30(const uint8_t* src_ar30,
return 0;
}
-// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -2328,14 +2348,6 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_NV12TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
NV12ToARGBRow = NV12ToARGBRow_Any_MMI;
@@ -2344,6 +2356,14 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@@ -2356,16 +2376,17 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -2403,14 +2424,6 @@ static int NV21ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_NV21TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_NV21TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
NV21ToARGBRow = NV21ToARGBRow_Any_MMI;
@@ -2419,6 +2432,14 @@ static int NV21ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
@@ -2490,16 +2511,17 @@ int NV21ToABGR(const uint8_t* src_y,
}
// TODO(fbarchard): Consider SSSE3 2 step conversion.
-// Convert NV12 to RGB24 with matrix
-static int NV12ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -2557,16 +2579,17 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to RGB24 with matrix
-static int NV21ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -2730,83 +2753,6 @@ int NV21ToYUV24(const uint8_t* src_y,
return 0;
}
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- NV12ToARGBRow = NV12ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- }
- return 0;
-}
-
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8_t* src_yuy2,
@@ -2858,14 +2804,6 @@ int YUY2ToARGB(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToARGBRow = YUY2ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_YUY2TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI;
@@ -2873,6 +2811,14 @@ int YUY2ToARGB(const uint8_t* src_yuy2,
YUY2ToARGBRow = YUY2ToARGBRow_MMI;
}
}
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+ }
+ }
#endif
for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
@@ -2933,14 +2879,6 @@ int UYVYToARGB(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- UYVYToARGBRow = UYVYToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_UYVYTOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MMI;
@@ -2948,6 +2886,14 @@ int UYVYToARGB(const uint8_t* src_uyvy,
UYVYToARGBRow = UYVYToARGBRow_MMI;
}
}
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_MSA;
+ }
+ }
#endif
for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
@@ -2971,7 +2917,7 @@ static void WeavePixels(const uint8_t* src_u,
}
}
-// Convert Android420 to ARGB.
+// Convert Android420 to ARGB with matrix.
LIBYUV_API
int Android420ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
@@ -3072,6 +3018,1107 @@ int Android420ToABGR(const uint8_t* src_y,
height);
}
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGBARow = I422ToRGBARow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGBARow = I422ToRGBARow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
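
Editor's note: as the "Swap U and V" / "Use Yvu matrix" comments on the BGRA and RAW wrappers indicate, a byte-swapped output order falls out of the same RGBA/RGB24 row kernels: exchanging the U and V plane arguments and passing the mirrored (YVU) constants swaps the red and blue contributions, so the kernel writes B where it would have written R. A caller could obtain ABGR the same way through the now-exported I420ToARGBMatrix; the wrapper name below is illustrative, not an API added by this patch.

// Illustrative sketch: ABGR output via the ARGB matrix path, following the
// same swap-U/V + YVU-constants pattern used by I420ToBGRA/I420ToRAW above.
static int MyI420ToABGR(const uint8_t* src_y, int src_stride_y,
                        const uint8_t* src_u, int src_stride_u,
                        const uint8_t* src_v, int src_stride_v,
                        uint8_t* dst_abgr, int dst_stride_abgr,
                        int width, int height) {
  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
                          src_stride_v,  // Swap U and V
                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                          &kYvuI601Constants,  // Use Yvu matrix
                          width, height);
}
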
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGB24Row = I422ToRGB24Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to RGB24.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToRGB565Row = I422ToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
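+// Note (illustrative, not part of this change): unlike the 4:2:0 paths above,
+// which advance the chroma pointers only every other row (y & 1), this 4:2:2
+// path advances src_u and src_v on every row, because I422 carries one chroma
+// row per luma row.
+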
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
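+// Note on the dither path above (illustrative, not part of this change): each
+// output row y consumes one 4-byte row of the 4x4 table, selected by (y & 3)
+// and read as a single 32-bit dither word:
+//
+//   const uint32_t dither4 =
+//       *(const uint32_t*)(dither4x4 + ((y & 3) << 2));
+//
+// The row functions spread those 0..7 offsets across the pixels of the row
+// before the 8-8-8 values are truncated to 5-6-5, trading banding for noise.
+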
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;
+
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYvuH709Constants, width, height);
+}
+
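+// Illustrative usage sketch (not part of this change): AR30 output is 4 bytes
+// per pixel (2-bit alpha plus three 10-bit channels), so a tightly packed
+// destination uses a stride of width * 4. The frame.* fields are hypothetical
+// caller-side plane pointers and strides.
+//
+//   uint8_t* dst_ar30 = (uint8_t*)malloc((size_t)width * 4 * height);
+//   int result = I420ToAR30(frame.y, frame.y_stride, frame.u, frame.u_stride,
+//                           frame.v, frame.v_stride, dst_ar30, width * 4,
+//                           width, height);
+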
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert_from.cc b/TMessagesProj/jni/third_party/libyuv/source/convert_from.cc
index 0c95f1f29..f2cfc1d8f 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert_from.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert_from.cc
@@ -294,14 +294,6 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOYUY2ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOYUY2ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
@@ -310,6 +302,14 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -381,14 +381,6 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOUYVYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
@@ -397,6 +389,14 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -456,14 +456,6 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOUYVYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
@@ -472,6 +464,14 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -488,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): test negative height for invert.
LIBYUV_API
int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
@@ -502,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
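+  // With the source pointers now at the last row of each plane and the
+  // strides negated, the copies below read the source bottom-up, producing
+  // a vertically flipped NV12 image.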
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
@@ -534,899 +544,6 @@ int I420ToNV21(const uint8_t* src_y,
width, height);
}
-// Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToRGBARow = I422ToRGBARow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToRGBARow = I422ToRGBARow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToRGB24Row = I422ToRGB24Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToRGB24Row = I422ToRGB24Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J420 to RGB24.
-LIBYUV_API
-int J420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert J420 to RAW.
-LIBYUV_API
-int J420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to RGB24.
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to RAW.
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565 with specified color matrix.
-LIBYUV_API
-int I420ToRGB565Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToRGB565Row = I422ToRGB565Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvI601Constants, width, height);
-}
-
-// Convert J420 to RGB565.
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert H420 to RGB565.
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I422 to RGB565.
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
- ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
- if (!dither4x4) {
- dither4x4 = kDither565_4x4;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToARGBRow = I422ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToARGBRow = I422ToARGBRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
- }
- }
-#endif
- {
- // Allocate a row of argb.
- align_buffer_64(row_argb, width * 4);
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
- ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
- width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row_argb);
- }
- return 0;
-}
-
-// Convert I420 to AR30 with matrix
-static int I420ToAR30Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToAR30Row_C;
-
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-
-#if defined(HAS_I422TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToAR30Row = I422ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToAR30Row = I422ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToAR30Row = I422ToAR30Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYvuH709Constants, width, height);
-}
-
// Convert I420 to specified format
LIBYUV_API
int ConvertFromI420(const uint8_t* y,
@@ -1528,7 +645,6 @@ int ConvertFromI420(const uint8_t* y,
height);
break;
}
- // TODO(fbarchard): Add M420.
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert_from_argb.cc b/TMessagesProj/jni/third_party/libyuv/source/convert_from_argb.cc
index de301ebbc..4ba4bb5e0 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert_from_argb.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert_from_argb.cc
@@ -68,14 +68,6 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOUV444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOUV444ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
@@ -84,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_MSA;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -108,14 +108,6 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
@@ -124,6 +116,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -207,36 +207,28 @@ int ARGBToI422(const uint8_t* src_argb,
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
@@ -315,38 +307,30 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -371,14 +355,6 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow_ = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MergeUVRow_ = MergeUVRow_Any_MMI;
@@ -386,6 +362,14 @@ int ARGBToNV12(const uint8_t* src_argb,
MergeUVRow_ = MergeUVRow_MMI;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
#endif
{
// Allocate a rows of uv.
@@ -475,39 +459,30 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
-
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -532,14 +507,6 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow_ = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MergeUVRow_ = MergeUVRow_Any_MMI;
@@ -547,6 +514,14 @@ int ARGBToNV21(const uint8_t* src_argb,
MergeUVRow_ = MergeUVRow_MMI;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
#endif
{
// Allocate a rows of uv.
@@ -635,38 +610,30 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToUVRow = ABGRToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ABGRToUVRow = ABGRToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MMI)
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_MMI;
}
}
#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -691,14 +658,6 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow_ = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MergeUVRow_ = MergeUVRow_Any_MMI;
@@ -706,6 +665,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
MergeUVRow_ = MergeUVRow_MMI;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
#endif
{
// Allocate a rows of uv.
@@ -795,39 +762,30 @@ int ABGRToNV21(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToUVRow = ABGRToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ABGRToUVRow = ABGRToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MMI)
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ABGRToYRow = ABGRToYRow_Any_MMI;
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_MMI;
}
}
#endif
-
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -852,14 +810,6 @@ int ABGRToNV21(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow_ = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MergeUVRow_ = MergeUVRow_Any_MMI;
@@ -867,6 +817,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
MergeUVRow_ = MergeUVRow_MMI;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
#endif
{
// Allocate a rows of uv.
@@ -961,38 +919,30 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -1017,14 +967,6 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOYUY2ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOYUY2ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
@@ -1033,6 +975,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
{
// Allocate a rows of yuv.
@@ -1122,38 +1072,30 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_MMI;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -1178,14 +1120,6 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOUYVYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
@@ -1194,6 +1128,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
{
// Allocate a rows of yuv.
@@ -1263,14 +1205,6 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
@@ -1279,6 +1213,14 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYRow(src_argb, dst_y, width);
@@ -1361,14 +1303,6 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToRGB24Row = ARGBToRGB24Row_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTORGB24ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
@@ -1377,6 +1311,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -1435,14 +1377,6 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORAWROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToRAWRow = ARGBToRAWRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTORAWROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
@@ -1451,6 +1385,14 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRAWRow(src_argb, dst_raw, width);
@@ -1513,14 +1455,6 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
@@ -1529,6 +1463,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
@@ -1590,14 +1532,6 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565Row = ARGBToRGB565Row_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTORGB565ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
@@ -1606,6 +1540,14 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -1664,14 +1606,6 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOARGB1555ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
@@ -1680,6 +1614,14 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1738,14 +1680,6 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOARGB4444ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
@@ -1754,6 +1688,14 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1922,35 +1864,27 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
}
}
#endif
@@ -2039,35 +1973,27 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_MMI;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
}
}
#endif
@@ -2132,14 +2058,6 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOYJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYJRow = ARGBToYJRow_Any_MMI;
@@ -2148,6 +2066,14 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYJRow(src_argb, dst_yj, width);
@@ -2206,14 +2132,6 @@ int RGBAToJ400(const uint8_t* src_rgba,
}
}
#endif
-#if defined(HAS_RGBATOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToYJRow = RGBAToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToYJRow = RGBAToYJRow_MSA;
- }
- }
-#endif
#if defined(HAS_RGBATOYJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGBAToYJRow = RGBAToYJRow_Any_MMI;
@@ -2222,6 +2140,14 @@ int RGBAToJ400(const uint8_t* src_rgba,
}
}
#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGBAToYJRow(src_rgba, dst_yj, width);
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert_jpeg.cc b/TMessagesProj/jni/third_party/libyuv/source/convert_jpeg.cc
index f440c7c2e..d7556ee91 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert_jpeg.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert_jpeg.cc
@@ -328,6 +328,140 @@ int MJPGToNV21(const uint8_t* src_mjpg,
return ret ? 0 : 1;
}
+static void JpegI420ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 since there is no UV plane.
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ // Use NV21Buffers but with UV instead of VU.
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
+ dst_stride_uv, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
struct ARGBBuffers {
uint8_t* argb;
int argb_stride;
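
The new MJPGToNV12 entry point follows the same callback pattern as MJPGToNV21, reusing NV21Buffers with the U/V order flipped. A minimal caller sketch, assuming JPEG support is compiled in and that the DecodeFrameToNV12 wrapper, buffer names and dimensions are illustrative (the header include is assumed to be the one that declares the other MJPEG converters):

// Decode one MJPEG frame straight into an NV12 destination.
// Returns 0 on success, -1 if the sample size is unknown, and 1 on a
// decode or dimension mismatch, matching the implementation above.
#include "libyuv/convert.h"

int DecodeFrameToNV12(const uint8_t* mjpg, size_t mjpg_size,
                      uint8_t* dst_y, int dst_stride_y,
                      uint8_t* dst_uv, int dst_stride_uv,
                      int width, int height) {
  return libyuv::MJPGToNV12(mjpg, mjpg_size,
                            dst_y, dst_stride_y,
                            dst_uv, dst_stride_uv,
                            width, height,   // source dimensions
                            width, height);  // destination dimensions
}
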
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert_to_argb.cc b/TMessagesProj/jni/third_party/libyuv/source/convert_to_argb.cc
index c08f61013..84df16c8c 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert_to_argb.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert_to_argb.cc
@@ -180,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample,
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
dst_stride_argb, crop_width, inv_crop_height);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
- inv_crop_height);
- break;
-
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/TMessagesProj/jni/third_party/libyuv/source/convert_to_i420.cc b/TMessagesProj/jni/third_party/libyuv/source/convert_to_i420.cc
index 584be0ac3..ac6eeab24 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/convert_to_i420.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/convert_to_i420.cc
@@ -179,11 +179,6 @@ int ConvertToI420(const uint8_t* sample,
dst_stride_y, dst_v, dst_stride_v, dst_u,
dst_stride_u, crop_width, inv_crop_height, rotation);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/TMessagesProj/jni/third_party/libyuv/source/cpu_id.cc b/TMessagesProj/jni/third_party/libyuv/source/cpu_id.cc
index 48e2b6152..fe89452b7 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/cpu_id.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/cpu_id.cc
@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
@@ -163,44 +163,38 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
}
// TODO(fbarchard): Consider read_msa_ir().
-// TODO(fbarchard): Add unittest.
-LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
- const char ase[]) {
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
+ int flag = 0x0;
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
- // ase enabled if /proc/cpuinfo is unavailable.
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
- }
+ // Assume nothing if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
return 0;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+ // Workaround early kernel without mmi in ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-3")) {
+ flag |= kCpuHasMMI;
+ } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= kCpuHasMMI | kCpuHasMSA;
+ }
+ }
if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
- char* p = strstr(cpuinfo_line, ase);
- if (p) {
- fclose(f);
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- return 0;
+ if (strstr(cpuinfo_line, "loongson-mmi") &&
+ strstr(cpuinfo_line, "loongson-ext")) {
+ flag |= kCpuHasMMI;
}
- } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
- char* p = strstr(cpuinfo_line, "Loongson-3");
- if (p) {
- fclose(f);
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
- }
- return 0;
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= kCpuHasMSA;
}
+ // ASEs is the last line, so we can break here.
+ break;
}
}
fclose(f);
- return 0;
+ return flag;
}
static SAFEBUFFERS int GetCpuFlags(void) {
@@ -242,11 +236,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
}
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#elif defined(_MIPS_ARCH_LOONGSON3A)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi");
-#endif
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
#if defined(__arm__) || defined(__aarch64__)
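
MipsCpuCaps now scans /proc/cpuinfo once and ORs together every supported ASE instead of being asked for one feature at a time. A rough sketch of how the new parser behaves against a hypothetical Loongson cpuinfo sample; the sample text, temp path and CheckMipsSample helper are illustrative only, and the matching single-argument declaration is assumed to be updated in cpu_id.h elsewhere in this patch:

#include <stdio.h>
#include "libyuv/cpu_id.h"

static int CheckMipsSample(void) {
  const char* sample =
      "cpu model\t\t: Loongson-3A R4 (Loongson-3A4000) @ 1800MHz\n"
      "ASEs implemented\t: vz msa loongson-mmi loongson-cam loongson-ext\n";
  FILE* f = fopen("/tmp/cpuinfo_sample", "w");
  if (!f) return -1;
  fputs(sample, f);
  fclose(f);
  // "Loongson-3" on the "cpu model" line sets kCpuHasMMI; the ASEs line adds
  // kCpuHasMMI again (loongson-mmi + loongson-ext) and kCpuHasMSA ("msa").
  return libyuv::MipsCpuCaps("/tmp/cpuinfo_sample");  // kCpuHasMMI | kCpuHasMSA
}
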
diff --git a/TMessagesProj/jni/third_party/libyuv/source/planar_functions.cc b/TMessagesProj/jni/third_party/libyuv/source/planar_functions.cc
index 1aa151b62..d5cd7e680 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/planar_functions.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/planar_functions.cc
@@ -402,14 +402,6 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SplitUVRow = SplitUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_SPLITUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SplitUVRow = SplitUVRow_Any_MMI;
@@ -418,6 +410,14 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
@@ -477,14 +477,6 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- MergeUVRow = MergeUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MergeUVRow = MergeUVRow_Any_MMI;
@@ -493,6 +485,14 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
@@ -579,6 +579,15 @@ int NV21ToNV12(const uint8_t* src_y,
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_vu = src_vu + (halfheight - 1) * src_stride_vu;
+ src_stride_vu = -src_stride_vu;
+ }
+
SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
halfheight);
return 0;
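
With this change the converter honours the usual negative-height convention for the chroma plane as well (CopyPlane handles the luma flip itself), so a vertical flip can be folded into the conversion. A small usage sketch with illustrative buffer names:

// Convert NV21 to NV12 and flip vertically in the same call by passing a
// negative height; dst_y may be NULL if only the chroma swap is needed.
libyuv::NV21ToNV12(src_y, src_stride_y,
                   src_vu, src_stride_vu,
                   dst_y, dst_stride_y,
                   dst_uv, dst_stride_uv,
                   width, -height);
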
@@ -625,14 +634,6 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
-#if defined(HAS_SPLITRGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SplitRGBRow = SplitRGBRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- SplitRGBRow = SplitRGBRow_NEON;
- }
- }
-#endif
#if defined(HAS_SPLITRGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SplitRGBRow = SplitRGBRow_Any_MMI;
@@ -641,6 +642,14 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
+#if defined(HAS_SPLITRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitRGBRow = SplitRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitRGBRow = SplitRGBRow_NEON;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
// Copy a row of RGB.
@@ -716,70 +725,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
- }
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -844,17 +789,7 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToYRow = YUY2ToYRow_Any_MSA;
- YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToYRow = YUY2ToYRow_MSA;
- YUY2ToUV422Row = YUY2ToUV422Row_MSA;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_MMI)
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
YUY2ToYRow = YUY2ToYRow_Any_MMI;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
@@ -864,6 +799,16 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -940,17 +885,7 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- UYVYToYRow = UYVYToYRow_Any_MSA;
- UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- UYVYToYRow = UYVYToYRow_MSA;
- UYVYToUV422Row = UYVYToUV422Row_MSA;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_MMI)
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
UYVYToYRow = UYVYToYRow_Any_MMI;
UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
@@ -960,6 +895,16 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUV422Row = UYVYToUV422Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -1022,14 +967,6 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToYRow = YUY2ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToYRow = YUY2ToYRow_MSA;
- }
- }
-#endif
#if defined(HAS_YUY2TOYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
YUY2ToYRow = YUY2ToYRow_Any_MMI;
@@ -1038,6 +975,14 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToYRow(src_yuy2, dst_y, width);
@@ -1047,6 +992,130 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
+// Mirror a plane of data.
+// See Also I400Mirror
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+ MirrorUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorUVRow = MirrorUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorUVRow = MirrorUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorUVRow = MirrorUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorUVRow = MirrorUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_MSA;
+ }
+ }
+#endif
+
+ // MirrorUV plane
+ for (y = 0; y < height; ++y) {
+ MirrorUVRow(src_uv, dst_uv, width);
+ src_uv += src_stride_uv;
+ dst_uv += dst_stride_uv;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1087,7 +1156,7 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@@ -1111,6 +1180,41 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
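
NV12Mirror is a new public wrapper over MirrorPlane and MirrorUVPlane above. A minimal usage sketch with illustrative buffer names:

// Horizontally mirror an NV12 frame, e.g. a front-camera preview.
// dst_y may be NULL to mirror only the chroma plane, matching the
// if (dst_y) guard in the implementation.
if (libyuv::NV12Mirror(src_y, src_stride_y,
                       src_uv, src_stride_uv,
                       dst_y, dst_stride_y,
                       dst_uv, dst_stride_uv,
                       width, height) != 0) {
  // Invalid arguments: a required plane was NULL, width <= 0 or height == 0.
}
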
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1134,7 +1238,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -1155,14 +1259,6 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBMIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
@@ -1171,6 +1267,14 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -1181,6 +1285,52 @@ int ARGBMirror(const uint8_t* src_argb,
return 0;
}
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+ RGB24MirrorRow_C;
+ if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+#if defined(HAS_RGB24MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
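
RGB24Mirror is the packed-pixel counterpart; a one-line usage sketch (names illustrative):

// Mirror a packed 3-byte-per-pixel RGB24 image horizontally; a negative
// height additionally flips it vertically, as handled above.
libyuv::RGB24Mirror(src_rgb24, src_stride_rgb24,
                    dst_rgb24, dst_stride_rgb24, width, height);
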
// Get a blender that optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
@@ -1199,15 +1349,15 @@ ARGBBlendRow GetARGBBlend() {
ARGBBlendRow = ARGBBlendRow_NEON;
}
#endif
-#if defined(HAS_ARGBBLENDROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBBlendRow = ARGBBlendRow_MSA;
- }
-#endif
#if defined(HAS_ARGBBLENDROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBBlendRow = ARGBBlendRow_MMI;
}
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBBlendRow = ARGBBlendRow_MSA;
+ }
#endif
return ARGBBlendRow;
}
@@ -1517,14 +1667,6 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBMULTIPLYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
- if (IS_ALIGNED(width, 4)) {
- ARGBMultiplyRow = ARGBMultiplyRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBMULTIPLYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
@@ -1533,6 +1675,14 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+ }
+ }
+#endif
// Multiply plane
for (y = 0; y < height; ++y) {
@@ -1602,14 +1752,6 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBADDROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBAddRow = ARGBAddRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBAddRow = ARGBAddRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBADDROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBAddRow = ARGBAddRow_Any_MMI;
@@ -1618,6 +1760,14 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAddRow = ARGBAddRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_MSA;
+ }
+ }
+#endif
// Add plane
for (y = 0; y < height; ++y) {
@@ -1682,14 +1832,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBSUBTRACTROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBSubtractRow = ARGBSubtractRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBSUBTRACTROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
@@ -1698,6 +1840,14 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_MSA;
+ }
+ }
+#endif
// Subtract plane
for (y = 0; y < height; ++y) {
@@ -1708,193 +1858,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToRGBARow = I422ToRGBARow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- I422ToRGBARow = I422ToRGBARow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB565Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MSA;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
// Convert RAW to RGB24.
LIBYUV_API
@@ -1938,14 +1901,6 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_RAWTORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToRGB24Row = RAWToRGB24Row_MSA;
- }
- }
-#endif
#if defined(HAS_RAWTORGB24ROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
@@ -1954,6 +1909,14 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGB24Row = RAWToRGB24Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -2089,14 +2052,6 @@ int ARGBRect(uint8_t* dst_argb,
ARGBSetRow = ARGBSetRow_X86;
}
#endif
-#if defined(HAS_ARGBSETROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBSetRow = ARGBSetRow_Any_MSA;
- if (IS_ALIGNED(width, 4)) {
- ARGBSetRow = ARGBSetRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBSETROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBSetRow = ARGBSetRow_Any_MMI;
@@ -2105,6 +2060,14 @@ int ARGBRect(uint8_t* dst_argb,
}
}
#endif
+#if defined(HAS_ARGBSETROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBSetRow = ARGBSetRow_Any_MSA;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_MSA;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -2175,14 +2138,6 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBATTENUATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
@@ -2191,6 +2146,14 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -2286,16 +2249,16 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
- ARGBGrayRow = ARGBGrayRow_MSA;
- }
-#endif
#if defined(HAS_ARGBGRAYROW_MMI)
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
ARGBGrayRow = ARGBGrayRow_MMI;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(src_argb, dst_argb, width);
@@ -2336,16 +2299,16 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
- ARGBGrayRow = ARGBGrayRow_MSA;
- }
-#endif
#if defined(HAS_ARGBGRAYROW_MMI)
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
ARGBGrayRow = ARGBGrayRow_MMI;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(dst, dst, width);
@@ -2384,16 +2347,16 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
#endif
-#if defined(HAS_ARGBSEPIAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
- ARGBSepiaRow = ARGBSepiaRow_MSA;
- }
-#endif
#if defined(HAS_ARGBSEPIAROW_MMI)
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
ARGBSepiaRow = ARGBSepiaRow_MMI;
}
#endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBSepiaRow(dst, width);
@@ -2440,15 +2403,15 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
-#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
- ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
- }
-#endif
#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
}
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
+ }
#endif
for (y = 0; y < height; ++y) {
ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
@@ -2814,16 +2777,16 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_NEON;
}
#endif
-#if defined(HAS_ARGBSHADEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
- ARGBShadeRow = ARGBShadeRow_MSA;
- }
-#endif
#if defined(HAS_ARGBSHADEROW_MMI)
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
ARGBShadeRow = ARGBShadeRow_MMI;
}
#endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -2887,14 +2850,6 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -2903,6 +2858,14 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -3018,14 +2981,6 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_ARGBSHUFFLEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBShuffleRow = ARGBShuffleRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBSHUFFLEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
@@ -3034,6 +2989,14 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -3043,6 +3006,80 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
+// Gauss blur a float plane using Gaussian 5x5 filter with
+// coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+// Edge is 2 pixels on each side, and interior is multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+ const float* src3, const float* src4, float* dst,
+ int width) = GaussCol_F32_C;
+ void (*GaussRow_F32)(const float* src, float* dst, int width) =
+ GaussRow_F32_C;
+ if (!src || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussCol_F32 = GaussCol_F32_NEON;
+ }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussRow_F32 = GaussRow_F32_NEON;
+ }
+#endif
+ {
+ // 2 pixels on each side, but aligned out to 16 bytes.
+ align_buffer_64(rowbuf, (4 + width + 4) * 4);
+ memset(rowbuf, 0, 16);
+ memset(rowbuf + (4 + width) * 4, 0, 16);
+ float* row = (float*)(rowbuf + 16);
+ const float* src0 = src;
+ const float* src1 = src;
+ const float* src2 = src;
+ const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+ const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+ for (y = 0; y < height; ++y) {
+ GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+ // Extrude edge by 2 floats
+ row[-2] = row[-1] = row[0];
+ row[width + 1] = row[width] = row[width - 1];
+
+ GaussRow_F32(row - 2, dst, width);
+
+ src0 = src1;
+ src1 = src2;
+ src2 = src3;
+ src3 = src4;
+ if ((y + 2) < (height - 1)) {
+ src4 += src_stride;
+ }
+ dst += dst_stride;
+ }
+ free_aligned_buffer_64(rowbuf);
+ }
+ return 0;
+}
+
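
GaussPlane_F32 applies the 1-4-6-4-1 taps once down the columns and once along the rows, so the effective 2D kernel is the outer product of those taps and sums to 256. The reference-only sketch below spells out the per-pixel sum for an interior pixel; GaussPixel5x5 is a hypothetical helper, and where the library folds in the 1/256 normalization is left to the row/column kernels:

// Reference math for the separable 5x5 Gaussian used above.
static float GaussPixel5x5(const float* src, int stride, int x, int y) {
  static const float kTaps[5] = {1.f, 4.f, 6.f, 4.f, 1.f};
  float sum = 0.f;
  for (int j = 0; j < 5; ++j) {
    for (int i = 0; i < 5; ++i) {
      sum += kTaps[j] * kTaps[i] * src[(y + j - 2) * stride + (x + i - 2)];
    }
  }
  return sum / 256.f;  // interior pixel only; the real code clamps the edges
}
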
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
@@ -3097,14 +3134,6 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYJRow = ARGBToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBTOYJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYJRow = ARGBToYJRow_Any_MMI;
@@ -3113,6 +3142,14 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -3124,16 +3161,16 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelYRow = SobelYRow_NEON;
}
#endif
-#if defined(HAS_SOBELYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SobelYRow = SobelYRow_MSA;
- }
-#endif
#if defined(HAS_SOBELYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SobelYRow = SobelYRow_MMI;
}
#endif
+#if defined(HAS_SOBELYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelYRow = SobelYRow_MSA;
+ }
+#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@@ -3144,15 +3181,15 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelXRow = SobelXRow_NEON;
}
#endif
-#if defined(HAS_SOBELXROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SobelXRow = SobelXRow_MSA;
- }
-#endif
#if defined(HAS_SOBELXROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SobelXRow = SobelXRow_MMI;
}
+#endif
+#if defined(HAS_SOBELXROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelXRow = SobelXRow_MSA;
+ }
#endif
{
// 3 rows with edges before/after.
@@ -3228,14 +3265,6 @@ int ARGBSobel(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SobelRow = SobelRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- SobelRow = SobelRow_MSA;
- }
- }
-#endif
#if defined(HAS_SOBELROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SobelRow = SobelRow_Any_MMI;
@@ -3243,6 +3272,14 @@ int ARGBSobel(const uint8_t* src_argb,
SobelRow = SobelRow_MMI;
}
}
+#endif
+#if defined(HAS_SOBELROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelRow = SobelRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_MSA;
+ }
+ }
#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height, SobelRow);
@@ -3274,14 +3311,6 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELTOPLANEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- SobelToPlaneRow = SobelToPlaneRow_MSA;
- }
- }
-#endif
#if defined(HAS_SOBELTOPLANEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
@@ -3289,6 +3318,14 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
SobelToPlaneRow = SobelToPlaneRow_MMI;
}
}
+#endif
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SobelToPlaneRow = SobelToPlaneRow_MSA;
+ }
+ }
#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
height, SobelToPlaneRow);
@@ -3321,14 +3358,6 @@ int ARGBSobelXY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELXYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SobelXYRow = SobelXYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- SobelXYRow = SobelXYRow_MSA;
- }
- }
-#endif
#if defined(HAS_SOBELXYROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SobelXYRow = SobelXYRow_Any_MMI;
@@ -3336,6 +3365,14 @@ int ARGBSobelXY(const uint8_t* src_argb,
SobelXYRow = SobelXYRow_MMI;
}
}
+#endif
+#if defined(HAS_SOBELXYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelXYRow = SobelXYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_MSA;
+ }
+ }
#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height, SobelXYRow);
@@ -3634,18 +3671,18 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_NEON;
}
#endif
-#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
- : ARGBExtractAlphaRow_Any_MSA;
- }
-#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
: ARGBExtractAlphaRow_Any_MMI;
}
#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+ : ARGBExtractAlphaRow_Any_MSA;
+ }
+#endif
for (int y = 0; y < height; ++y) {
ARGBExtractAlphaRow(src_argb, dst_a, width);
@@ -3766,14 +3803,6 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SplitUVRow = SplitUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_SPLITUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SplitUVRow = SplitUVRow_Any_MMI;
@@ -3782,6 +3811,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3806,14 +3843,6 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -3822,6 +3851,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
{
int awidth = halfwidth * 2;
@@ -3898,14 +3935,6 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- SplitUVRow = SplitUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_MSA;
- }
- }
-#endif
#if defined(HAS_SPLITUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
SplitUVRow = SplitUVRow_Any_MMI;
@@ -3914,6 +3943,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3938,14 +3975,6 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -3954,6 +3983,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
{
int awidth = halfwidth * 2;
@@ -3981,6 +4018,56 @@ int UYVYToNV12(const uint8_t* src_uyvy,
return 0;
}
+// width and height are src size allowing odd size handling.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+ for (y = 0; y < height - 1; y += 2) {
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
+
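
HalfMergeUVPlane combines the 2x2 chroma downsample and the UV interleave in one pass. A hedged usage sketch, assuming full-resolution U and V planes (for instance from an I444 source) being packed into NV12-style chroma; names are illustrative:

// width/height are the source U/V plane dimensions; odd sizes are handled.
// The destination receives (height + 1) / 2 rows of (width + 1) / 2 UV pairs.
libyuv::HalfMergeUVPlane(src_u, src_stride_u,
                         src_v, src_stride_v,
                         dst_uv, dst_stride_uv,
                         width, height);
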
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/TMessagesProj/jni/third_party/libyuv/source/rotate.cc b/TMessagesProj/jni/third_party/libyuv/source/rotate.cc
index d414186a5..32904e473 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/rotate.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/rotate.cc
@@ -36,6 +36,15 @@ void TransposePlane(const uint8_t* src,
void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
@@ -62,14 +71,7 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeWx16 = TransposeWx16_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx16 = TransposeWx16_MSA;
- }
- }
-#endif
+#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
#if defined(HAS_TRANSPOSEWX16_MSA)
// Work across the source in 16x16 tiles
@@ -142,7 +144,7 @@ void RotatePlane180(const uint8_t* src,
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
@@ -163,14 +165,6 @@ void RotatePlane180(const uint8_t* src,
}
}
#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
#if defined(HAS_MIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MirrorRow = MirrorRow_Any_MMI;
@@ -179,6 +173,14 @@ void RotatePlane180(const uint8_t* src,
}
}
#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -207,11 +209,11 @@ void RotatePlane180(const uint8_t* src,
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
+ CopyRow(src, row, width); // Copy first row into buffer
MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into last row
+ src += src_stride;
dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
src_bot -= src_stride;
dst_bot -= dst_stride;
}
@@ -237,6 +239,15 @@ void TransposeUV(const uint8_t* src,
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -258,14 +269,7 @@ void TransposeUV(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeUVWx16 = TransposeUVWx16_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx16 = TransposeUVWx16_MSA;
- }
- }
-#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
#if defined(HAS_TRANSPOSEUVWX16_MSA)
// Work through the source in 8x8 tiles.
@@ -340,26 +344,26 @@ void RotateUV180(const uint8_t* src,
int width,
int height) {
int i;
- void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
- int width) = MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
}
#endif
-#if defined(HAS_MIRRORUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
- MirrorUVRow = MirrorUVRow_MSA;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_MMI)
+#if defined(HAS_MIRRORSPLITUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_MMI;
+ MirrorSplitUVRow = MirrorSplitUVRow_MMI;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
}
#endif
@@ -367,7 +371,7 @@ void RotateUV180(const uint8_t* src,
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
diff --git a/TMessagesProj/jni/third_party/libyuv/source/rotate_argb.cc b/TMessagesProj/jni/third_party/libyuv/source/rotate_argb.cc
index a93fd55f9..ae6538860 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/rotate_argb.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/rotate_argb.cc
@@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check stride is a multiple of 4.
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -48,14 +52,6 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
- if (IS_ALIGNED(height, 4)) { // Width of dest.
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
@@ -64,50 +60,59 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+ }
+ }
+#endif
for (i = 0; i < width; ++i) { // column of source to row of dest.
ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
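
The 90-degree helper keeps the trick described in its comment: point the source at the last row, negate the stride, and transpose. A self-contained scalar sketch of that idea (not the library code):

#include <stdint.h>
#include <string.h>

// Rotate-90 as a transpose whose source rows are read bottom-to-top.
static void ARGBRotate90Plain(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
  src += src_stride * (height - 1);  // start at the last source row
  src_stride = -src_stride;          // and read rows upwards
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < height; ++y) {
      memcpy(dst + x * dst_stride + y * 4, src + y * src_stride + x * 4, 4);
    }
  }
}

ARGBRotate270 below is the mirror image of the same idea: the source is read top to bottom, but the destination rows are written bottom to top, so only the destination pointer and stride are flipped.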
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@@ -121,7 +126,7 @@ void ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -142,14 +147,6 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_MSA;
- }
- }
-#endif
#if defined(HAS_ARGBMIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
@@ -158,6 +155,14 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
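
ARGBRotate180 keeps the temp-row strategy from its comment (swap opposite rows while mirroring each), tightens the NEON ARGBMirrorRow fast path to widths that are multiples of 8, and now returns 0 so the dispatcher below can forward results uniformly. For reference, the same result for non-overlapping buffers can be written as one direct scalar loop (a simplified sketch, not the row-function implementation above):

#include <stdint.h>
#include <string.h>

// Each destination row is the opposite source row with its pixels reversed.
// Assumes src and dst do not overlap; the temp-row version above instead
// swaps opposite rows pairwise.
static void ARGBRotate180Plain(const uint8_t* src, int src_stride,
                               uint8_t* dst, int dst_stride,
                               int width, int height) {
  for (int y = 0; y < height; ++y) {
    const uint8_t* s = src + (height - 1 - y) * src_stride;
    uint8_t* d = dst + y * dst_stride;
    for (int x = 0; x < width; ++x) {
      memcpy(d + x * 4, s + (width - 1 - x) * 4, 4);
    }
  }
}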
LIBYUV_API
@@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
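
With the helpers returning int, ARGBRotate forwards their results instead of always returning 0, so a caller can now see the stride check fail. A hypothetical usage sketch (error handling is up to the application):

#include <stdint.h>

#include "libyuv/rotate.h"       // RotationMode / kRotate90
#include "libyuv/rotate_argb.h"  // ARGBRotate

// Check the return value, which is now -1 when the source stride is not a
// multiple of 4 bytes (or when other arguments are invalid).
static int RotateOrFail(const uint8_t* src, int src_stride, uint8_t* dst,
                        int dst_stride, int width, int height) {
  if (ARGBRotate(src, src_stride, dst, dst_stride, width, height,
                 kRotate90) != 0) {
    return -1;  // e.g. fall back to an unrotated copy or report the error
  }
  return 0;
}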
diff --git a/TMessagesProj/jni/third_party/libyuv/source/rotate_gcc.cc b/TMessagesProj/jni/third_party/libyuv/source/rotate_gcc.cc
index 04e19e29e..fd359d4ae 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/rotate_gcc.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/rotate_gcc.cc
@@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
diff --git a/TMessagesProj/jni/third_party/libyuv/source/rotate_neon.cc b/TMessagesProj/jni/third_party/libyuv/source/rotate_neon.cc
index fdc0dd476..844df2bf3 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/rotate_neon.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
+ "mov %0, %1 \n"
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
+ "mov %0, %1 \n"
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
- "mov %0, %5 \n"
+ "mov %0, %5 \n"
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
diff --git a/TMessagesProj/jni/third_party/libyuv/source/rotate_neon64.cc b/TMessagesProj/jni/third_party/libyuv/source/rotate_neon64.cc
index f469baacf..43c158173 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/rotate_neon64.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/rotate_neon64.cc
@@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
+ "mov %0, %1 \n"
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "trn1 v23.8b, v6.8b, v7.8b \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "trn1 v4.4h, v20.4h, v22.4h \n"
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
+ "mov %0, %1 \n"
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_any.cc b/TMessagesProj/jni/third_party/libyuv/source/row_any.cc
index 9b29b2bfb..7216373bc 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_any.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_any.cc
@@ -546,12 +546,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_J400TOARGBROW_AVX2)
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
@@ -581,7 +575,6 @@ ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
@@ -590,7 +583,6 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
-ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#endif
#if defined(HAS_ARGBTORGB24ROW_MMI)
ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
@@ -599,7 +591,6 @@ ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
-ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
@@ -695,6 +686,15 @@ ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
+#endif
#ifdef HAS_RGB24TOYROW_MSA
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
#endif
@@ -704,6 +704,15 @@ ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
+#endif
#ifdef HAS_RAWTOYROW_MSA
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
#endif
@@ -901,6 +910,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ANY11P(I400ToARGBRow_Any_MMI,
+ I400ToARGBRow_MMI,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
ARGBToRGB565DitherRow_SSE2,
@@ -1156,7 +1206,7 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
#endif
#ifdef HAS_MIRRORROW_MSA
ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
@@ -1164,6 +1214,18 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#ifdef HAS_MIRRORROW_MMI
ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -1171,7 +1233,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
@@ -1179,12 +1241,19 @@ ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#ifdef HAS_ARGBMIRRORROW_MMI
ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
+#endif
#undef ANY11M
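
The mask arguments changed above encode the kernel block size minus one: the NEON MirrorRow now handles 32 pixels per call (mask 31) and the NEON ARGBMirrorRow 8 pixels (mask 7). The ANY wrappers hand the kernel only an aligned multiple of that block and push the remainder through a padded temporary buffer. A simplified, hypothetical sketch of that split for a mirror-style row function (not the actual ANY11M macro):

#include <stdint.h>
#include <string.h>

typedef void (*MirrorRowFn)(const uint8_t* src, uint8_t* dst, int width);

// `mask + 1` is the kernel's block size in pixels, `bpp` the bytes per pixel;
// assumes (mask + 1) * bpp <= 64, as holds for the entries listed above.
static void MirrorAnySketch(MirrorRowFn kernel, const uint8_t* src,
                            uint8_t* dst, int width, int mask, int bpp) {
  uint8_t temp[64 * 2];
  int r = width & mask;   // tail pixels
  int n = width & ~mask;  // bulk pixels, a multiple of (mask + 1)
  if (n > 0) {
    kernel(src + r * bpp, dst, n);  // mirrored bulk lands at the start of dst
  }
  if (r > 0) {
    memset(temp, 0, 64);
    memcpy(temp, src, (size_t)(r * bpp));  // pad the tail to one full block
    kernel(temp, temp + 64, mask + 1);
    memcpy(dst + n * bpp, temp + 64 + (mask + 1 - r) * bpp, (size_t)(r * bpp));
  }
}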
// Any 1 plane. (memset)
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8_t temp[64]); \
+ memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
@@ -1371,7 +1440,7 @@ ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
-ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MMI
ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
@@ -1380,7 +1449,7 @@ ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
-ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MMI
ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
@@ -1389,7 +1458,7 @@ ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
-ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MMI
ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_common.cc b/TMessagesProj/jni/third_party/libyuv/source/row_common.cc
index 70aa2e13c..79aed5c78 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_common.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_common.cc
@@ -14,6 +14,7 @@
#include <string.h>  // For memcpy and memset.
#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
#ifdef __cplusplus
namespace libyuv {
@@ -26,10 +27,11 @@ extern "C" {
(defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
#define LIBYUV_RGB7 1
#endif
-// mips use 7 bit RGBToY
-#if (!defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)) || \
- (!defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa))
-#define LIBYUV_RGB7 1
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86)
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
#endif
// llvm x86 is poor at ternary operator, so use branchless min/max.
@@ -37,19 +39,19 @@ extern "C" {
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
static __inline int32_t clamp0(int32_t v) {
- return ((-(v) >> 31) & (v));
+ return -(v >= 0) & v;
}
-
+// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
+ return (-(v >= 255) | v) & 255;
}
static __inline int32_t clamp1023(int32_t v) {
- return (((1023 - (v)) >> 31) | (v)) & 1023;
+ return (-(v >= 1023) | v) & 1023;
}
static __inline uint32_t Abs(int32_t v) {
- int m = v >> 31;
+ int m = -(v < 0);
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
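
The branchless clamps now build their masks from comparisons instead of arithmetic right shifts of possibly negative intermediates: (v >= 0) is 0 or 1, so -(v >= 0) is all zeros or all ones, and no implementation-defined shift behaviour is involved. A tiny self-check of the new forms; note, as the TODO above says, clamp255 does not clamp negative inputs on its own, so it is paired with clamp0:

#include <assert.h>
#include <stdint.h>

static inline int32_t clamp0_sketch(int32_t v) { return -(v >= 0) & v; }
static inline int32_t clamp255_sketch(int32_t v) {
  return (-(v >= 255) | v) & 255;
}

int main(void) {
  assert(clamp0_sketch(-7) == 0 && clamp0_sketch(12) == 12);
  assert(clamp255_sketch(12) == 12 && clamp255_sketch(255) == 255);
  assert(clamp255_sketch(999) == 255);
  // Negative values pass through as their low byte here; clamp0 runs first.
  assert(clamp255_sketch(clamp0_sketch(-7)) == 0);
  return 0;
}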
@@ -208,7 +210,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -222,7 +225,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -236,7 +240,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = ar30 & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
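
The AR30 readers now fetch the 32-bit word with memcpy instead of dereferencing a cast pointer: the byte buffer may be unaligned and the cast also breaks strict aliasing, while a fixed-size memcpy compiles to the same single load. The idiom in isolation (a generic sketch, not a libyuv API):

#include <stdint.h>
#include <string.h>

// Safe unaligned 32-bit load in native byte order, as the rows above assume;
// optimizing compilers emit a plain 4-byte load for this.
static inline uint32_t LoadU32(const uint8_t* p) {
  uint32_t v;
  memcpy(&v, p, sizeof v);
  return v;
}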
@@ -425,14 +430,38 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
}
#endif
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * r - 94 * g - 18 * b + 0x8000) >> 8;
+}
+#else
+// TODO(fbarchard): Add rounding to SIMD and use this
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
+#endif
+
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8;
+}
+#endif
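
UV subsampling now has two explicit flavours. On x86 the C code mimics the SSE/AVX path: the 2x2 block is averaged with nested rounded byte averages (the pavgb pattern) and the full-scale coefficients are applied. The other path keeps a rounded half-sum in 16 bits and lets the halved RGB2xToU/RGB2xToV coefficients absorb the remaining factor of two. A small sketch contrasting the two averages for one channel (illustrative values only):

#include <stdint.h>
#include <stdio.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1)

int main(void) {
  // One 2x2 block of blue samples; the same applies to G, R and to V.
  uint8_t b00 = 10, b01 = 11, b10 = 250, b11 = 253;
  // x86 flavour: two levels of rounded pairwise averages, like pavgb.
  uint8_t avg_pavgb = AVGB(AVGB(b00, b10), AVGB(b01, b11));
  // Other flavour: one rounded halving of the 4-sample sum, kept in 16 bits;
  // RGB2xToU/RGB2xToV then use coefficients divided by two.
  uint16_t half_sum = (uint16_t)((b00 + b01 + b10 + b11 + 1) >> 1);
  printf("pavgb-style average = %u, half-sum = %u\n", (unsigned)avg_pavgb,
         (unsigned)half_sum);
  return 0;
}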
// ARGBToY_C and ARGBToUV_C
+// Intel version mimics SSE/AVX which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@@ -447,15 +476,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
+ uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
@@ -464,13 +490,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
dst_v += 1; \
} \
if (width & 1) { \
- uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
+#else
+// ARM version does sum / 2 then multiply by 2x smaller coefficients
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb0[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb0[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb0[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
+ }
+#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -517,16 +584,25 @@ static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
}
#endif
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#else
+static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToYJ_C and ARGBToUVJ_C
+// Intel version mimics SSE/AVX which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@@ -562,9 +638,53 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
dst_v[0] = RGBToVJ(ar, ag, ab); \
} \
}
+#else
+// ARM version does sum / 2 then multiply by 2x smaller coefficients
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -632,13 +752,34 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b3 = next_rgb565[2] & 0x1f;
uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
uint8_t r3 = next_rgb565[3] >> 3;
- uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 2) | (g1 >> 4);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 2) | (g3 >> 4);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
@@ -651,14 +792,27 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b2 = next_rgb565[0] & 0x1f;
uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b = (b0 + b2); // 565 * 2 = 676.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
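
RGB565ToUVRow (and the ARGB1555/ARGB4444 variants below) now expand each 5- or 6-bit channel to 8 bits by bit replication before averaging, so every packed format feeds the same RGBToU/RGBToV or RGB2xToU/RGB2xToV coefficients instead of patching the sum with format-specific shifts afterwards. The replication maps a channel's full scale exactly onto 0..255:

#include <assert.h>
#include <stdint.h>

// Copy a channel's top bits into the new low bits, so 0 stays 0 and the
// 5-/6-bit maximum becomes exactly 255.
static inline uint8_t Expand5(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }
static inline uint8_t Expand6(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }

int main(void) {
  assert(Expand5(0) == 0 && Expand5(31) == 255);
  assert(Expand6(0) == 0 && Expand6(63) == 255);
  assert(Expand5(16) == 132);  // 0b10000 -> 0b10000100
  return 0;
}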
@@ -682,14 +836,34 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b3 = next_argb1555[2] & 0x1f;
uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 3) | (g1 >> 2);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 3) | (g3 >> 2);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
@@ -702,14 +876,27 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b2 = next_argb1555[0] & 0x1f;
uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
uint8_t r2 = next_argb1555[1] >> 3;
- uint8_t b = (b0 + b2); // 555 * 2 = 666.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -733,14 +920,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b3 = next_argb4444[2] & 0x0f;
uint8_t g3 = next_argb4444[2] >> 4;
uint8_t r3 = next_argb4444[3] & 0x0f;
- uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b1 = (b1 << 4) | b1;
+ g1 = (g1 << 4) | g1;
+ r1 = (r1 << 4) | r1;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+ b3 = (b3 << 4) | b3;
+ g3 = (g3 << 4) | g3;
+ r3 = (r3 << 4) | r3;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
@@ -753,14 +960,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b2 = next_argb4444[0] & 0x0f;
uint8_t g2 = next_argb4444[0] >> 4;
uint8_t r2 = next_argb4444[1] & 0x0f;
- uint8_t b = (b0 + b2); // 444 * 2 = 555.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
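
The two branches above average the expanded samples differently: the LIBYUV_ARGBTOUV_PAVGB path (named after the x86 pavgb rounding-average instruction) nests rounding byte averages, while the fallback keeps a sum scaled by two and leaves the final division to the RGB2xToU/RGB2xToV helpers. A small sketch comparing the two, assuming nothing beyond the arithmetic visible above:

#include <stdint.h>
#include <stdio.h>

/* Rounding average of two bytes, matching the AVGB macro used above. */
#define AVGB(a, b) (((a) + (b) + 1) >> 1)

int main(void) {
  uint8_t s0 = 10, s1 = 11, s2 = 12, s3 = 250;

  /* Path 1: nested rounding averages (the LIBYUV_ARGBTOUV_PAVGB branch). */
  int avg_pavgb = AVGB(AVGB(s0, s2), AVGB(s1, s3));

  /* Path 2: keep a 2x-scaled value; (a+b+c+d+1)>>1 is twice the mean,
     rounded, which is the form the RGB2xToU/RGB2xToV calls above take. */
  int sum2x = (s0 + s1 + s2 + s3 + 1) >> 1;

  printf("pavgb average: %d\n", avg_pavgb);  /* 71 */
  printf("2x-scaled sum: %d\n", sum2x);      /* 142, roughly 2 * 70.75 */
  return 0;
}

Both branches round to nearest; the nested averages round twice, so they can differ from the exact mean by at most about one code value.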
@@ -1136,26 +1356,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__) // 32 bit arm
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1167,7 +1387,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1178,7 +1400,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1217,26 +1441,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1248,7 +1472,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1259,7 +1485,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1300,26 +1528,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1331,7 +1559,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1342,7 +1572,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1357,7 +1589,7 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
// BT.2020 YUV to RGB reference
// R = (Y - 16) * 1.164384 - V * -1.67867
-// G = (Y - 16) * 1.164384 - U * 0.187326 - V * -0.65042
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
// B = (Y - 16) * 1.164384 - U * -2.14177
// Y contribution to R,G,B. Scale and bias.
@@ -1365,6 +1597,7 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
+// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.142 * 64)) */
#define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */
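
For context, the floating-point BT.2020 reference in the comments above can be transcribed directly. This sketch assumes the usual limited-range conventions (Y biased by 16, U/V biased by 128), which the comment leaves implicit, and is only a cross-check for the fixed-point constants, not libyuv code:

#include <stdio.h>

/* Direct transcription of the BT.2020 limited-range reference above. */
static void yuv2020_to_rgb(double y, double u, double v,
                           double* r, double* g, double* b) {
  double yy = (y - 16.0) * 1.164384;
  *r = yy + (v - 128.0) * 1.67867;
  *g = yy - (u - 128.0) * 0.187326 - (v - 128.0) * 0.65042;
  *b = yy + (u - 128.0) * 2.14177;
}

int main(void) {
  double r, g, b;
  yuv2020_to_rgb(235.0, 128.0, 128.0, &r, &g, &b);  /* nominal white */
  printf("white: %.1f %.1f %.1f\n", r, g, b);       /* ~255 255 255 */
  yuv2020_to_rgb(16.0, 128.0, 128.0, &r, &g, &b);   /* nominal black */
  printf("black: %.1f %.1f %.1f\n", r, g, b);       /* ~0 0 0 */
  return 0;
}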
@@ -1381,26 +1614,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
{UG, VG, UG, VG, UG, VG, UG, VG},
{UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
{VG, UG, VG, UG, VG, UG, VG, UG},
{VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
#else
const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1412,7 +1645,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
@@ -1423,7 +1658,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
#endif
#undef BB
@@ -1438,7 +1675,6 @@ const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 16 bit.
-
static __inline void YuvPixel(uint8_t y,
uint8_t u,
uint8_t v,
@@ -1454,7 +1690,7 @@ static __inline void YuvPixel(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1463,7 +1699,7 @@ static __inline void YuvPixel(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1497,7 +1733,7 @@ static __inline void YuvPixel8_16(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1506,7 +1742,7 @@ static __inline void YuvPixel8_16(uint8_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1541,7 +1777,7 @@ static __inline void YuvPixel16(int16_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1550,7 +1786,7 @@ static __inline void YuvPixel16(int16_t y,
int bb = yuvconstants->kUVBiasBGR[0];
int bg = yuvconstants->kUVBiasBGR[1];
int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
+ int yg = yuvconstants->kYToRgb[1];
#else
int ub = yuvconstants->kUVToB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1588,21 +1824,26 @@ static __inline void YuvPixel10(uint16_t y,
*r = Clamp(r16 >> 6);
}
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
- uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32_t)(y1 + YGB) >> 6);
- *g = Clamp((int32_t)(y1 + YGB) >> 6);
- *r = Clamp((int32_t)(y1 + YGB) >> 6);
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YPixel(uint8_t y,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ *b = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *g = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *r = Clamp(((int32_t)(y1) + ygb) >> 6);
}
-#undef YG
-#undef YGB
-
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly.
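
The new YPixel above folds the grey-scale path into the same fixed-point scheme as the colour paths: the 8-bit Y is replicated to 16 bits (y * 0x0101), multiplied by the gain yg, then biased by ygb and shifted down by 6. A standalone sketch of that arithmetic, reusing the BT.601 values from the removed #defines purely for illustration:

#include <stdint.h>
#include <stdio.h>

static uint8_t clamp8(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Y-only (grey) expansion in the shape of the new YPixel(). */
static uint8_t y_to_grey(uint8_t y, int yg, int ygb) {
  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
  return clamp8(((int32_t)y1 + ygb) >> 6);
}

int main(void) {
  const int yg = 18997;  /* round(1.164 * 64 * 256 * 256 / 257) */
  const int ygb = -1160; /* 1.164 * 64 * -16 + 64 / 2 */
  printf("Y=16  -> %d\n", y_to_grey(16, yg, ygb));   /* 0   */
  printf("Y=128 -> %d\n", y_to_grey(128, yg, ygb));  /* 130 */
  printf("Y=235 -> %d\n", y_to_grey(235, yg, ygb));  /* 255 */
  return 0;
}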
@@ -2136,18 +2377,21 @@ void I422ToRGBARow_C(const uint8_t* src_y,
}
}
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2165,10 +2409,21 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -2199,6 +2454,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+ int x;
+ src_rgb24 += width * 3 - 3;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_rgb24[0];
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ src_rgb24 -= 3;
+ dst_rgb24 += 3;
+ }
+}
+
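
As a quick behavioural check of the new RGB24MirrorRow_C: the loop reverses pixel order while leaving the B,G,R byte order inside each pixel untouched. The standalone test below uses a local transcription of that loop under an illustrative name, not the libyuv symbol itself, so it compiles on its own:

#include <stdint.h>
#include <stdio.h>

/* Local transcription of the RGB24MirrorRow_C loop for a standalone check. */
static void rgb24_mirror(const uint8_t* src, uint8_t* dst, int width) {
  src += width * 3 - 3;
  for (int x = 0; x < width; ++x) {
    dst[0] = src[0];  /* B */
    dst[1] = src[1];  /* G */
    dst[2] = src[2];  /* R */
    src -= 3;
    dst += 3;
  }
}

int main(void) {
  const uint8_t row[4 * 3] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  uint8_t out[sizeof row];
  rgb24_mirror(row, out, 4);
  /* Pixel order reverses, byte order inside each pixel does not:
     10 11 12  7 8 9  4 5 6  1 2 3 */
  for (int i = 0; i < (int)sizeof out; ++i) printf("%d ", out[i]);
  printf("\n");
  return 0;
}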
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2338,10 +2608,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
}
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
- uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
- d[x] = v32;
+ memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
}
}
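
The ARGBSetRow_C change above replaces a uint32_t* store loop with a per-pixel memcpy, which stays well defined even when dst_argb is not 4-byte aligned; compilers normally lower the fixed-size memcpy to a single store anyway. A minimal sketch of the same pattern, with illustrative names:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fill a byte buffer with a repeated 32-bit pattern without requiring
 * 4-byte alignment of dst: each memcpy of sizeof(uint32_t) is well
 * defined and typically compiles to one unaligned store. */
static void fill_u32(uint8_t* dst, uint32_t v32, int count) {
  for (int x = 0; x < count; ++x) {
    memcpy(dst + x * sizeof(v32), &v32, sizeof(v32));
  }
}

int main(void) {
  uint8_t buf[9] = {0};
  fill_u32(buf + 1, 0x01020304u, 2);  /* deliberately misaligned start */
  for (int i = 0; i < 9; ++i) printf("%02x ", buf[i]);
  printf("\n");  /* 00 04 03 02 01 04 03 02 01 on a little-endian machine */
  return 0;
}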
@@ -2439,7 +2708,7 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
@@ -2515,10 +2784,14 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
+#if defined(__aarch64__) || defined(__arm__)
+#define ATTENUATE(f, a) (f * a + 128) >> 8
+#else
+// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#endif
// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
@@ -3305,6 +3578,70 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_SSSE3
+
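
The four wrappers above share one shape: convert up to MAXTWIDTH pixels at a time into an on-stack ARGB row buffer, then hand that buffer to an existing ARGB-to-YJ kernel, repeating until the row is consumed. A scalar sketch of that chunking pattern follows; the chunk size, helper names and luma weights are illustrative stand-ins, not libyuv's:

#include <stdint.h>
#include <stdio.h>

enum { CHUNK = 256 };  /* stand-in for libyuv's MAXTWIDTH staging width */

/* Illustrative scalar stand-ins for the SIMD row kernels the wrappers call. */
static void rgb24_to_argb_chunk(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i < n; ++i) {
    dst[4 * i + 0] = src[3 * i + 0];  /* B */
    dst[4 * i + 1] = src[3 * i + 1];  /* G */
    dst[4 * i + 2] = src[3 * i + 2];  /* R */
    dst[4 * i + 3] = 255;             /* A */
  }
}
static void argb_to_yj_chunk(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i < n; ++i) {  /* approximate full-range BT.601 luma */
    dst[i] = (uint8_t)((29 * src[4 * i] + 150 * src[4 * i + 1] +
                        77 * src[4 * i + 2] + 128) >> 8);
  }
}

/* The wrappers' shape: stage fixed-size chunks through an ARGB row buffer
 * so an existing ARGB->YJ kernel can serve RGB24 input of any width. */
static void rgb24_to_yj_row(const uint8_t* src_rgb24, uint8_t* dst_yj,
                            int width) {
  uint8_t row[CHUNK * 4];
  while (width > 0) {
    int twidth = width > CHUNK ? CHUNK : width;
    rgb24_to_argb_chunk(src_rgb24, row, twidth);
    argb_to_yj_chunk(row, dst_yj, twidth);
    src_rgb24 += twidth * 3;
    dst_yj += twidth;
    width -= twidth;
  }
}

int main(void) {
  uint8_t rgb24[3] = {0, 0, 255};  /* one pure-red pixel, B,G,R order */
  uint8_t yj;
  rgb24_to_yj_row(rgb24, &yj, 1);
  printf("YJ = %d\n", yj);  /* ~77 for full-range red */
  return 0;
}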
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
@@ -3358,6 +3695,29 @@ void GaussCol_C(const uint16_t* src0,
}
}
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
+
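
GaussCol_F32_C sums five rows with the binomial weights 1,4,6,4,1 and GaussRow_F32_C applies the same weights horizontally with a single 1/256 scale, so one vertical pass followed by one horizontal pass realises a 5x5 kernel whose total weight of 256 is normalised exactly once. A small self-contained sketch of one output sample, written independently of the libyuv row functions:

#include <stdio.h>

/* 5-tap binomial kernel used by the Gauss*_F32_C rows above. */
static const float kTap[5] = {1.f, 4.f, 6.f, 4.f, 1.f};

/* One output sample of the separable 5x5 filter at (x, y):
 * vertical 1-4-6-4-1 sums first, then the horizontal pass with the
 * single 1/256 normalisation, mirroring the GaussCol/GaussRow order. */
static float gauss5x5_sample(const float* img, int stride, int x, int y) {
  float col[5];
  for (int i = 0; i < 5; ++i) {     /* horizontal positions x..x+4 */
    float v = 0.f;
    for (int j = 0; j < 5; ++j) {   /* vertical taps y..y+4 */
      v += img[(y + j) * stride + (x + i)] * kTap[j];
    }
    col[i] = v;
  }
  float out = 0.f;
  for (int i = 0; i < 5; ++i) out += col[i] * kTap[i];
  return out * (1.0f / 256.0f);     /* 16 * 16 = 256 total weight */
}

int main(void) {
  float img[8 * 8];
  for (int i = 0; i < 64; ++i) img[i] = 1.0f;  /* constant image */
  /* A constant image must pass through unchanged: prints 1.000000 */
  printf("%f\n", gauss5x5_sample(img, 8, 0, 0));
  return 0;
}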
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
@@ -3459,6 +3819,30 @@ void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
}
}
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+ }
+}
+
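
HalfMergeUVRow_C reads full-resolution planar U and V, box-filters each 2x2 block with rounding, and writes the results interleaved at half resolution, i.e. it produces NV12-style chroma in a single pass. A tiny sketch of the rounding average it applies per output sample:

#include <stdint.h>
#include <stdio.h>

/* Rounded mean of a 2x2 block, as used by HalfMergeUVRow_C above. */
static uint8_t avg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}

int main(void) {
  /* 2x2 U block and 2x2 V block from two adjacent full-resolution rows. */
  uint8_t u[2][2] = {{100, 101}, {102, 104}};
  uint8_t v[2][2] = {{200, 198}, {199, 197}};

  /* One interleaved half-resolution UV sample (NV12-style chroma). */
  uint8_t uv[2];
  uv[0] = avg4(u[0][0], u[0][1], u[1][0], u[1][1]);  /* (407 + 2) >> 2 = 102 */
  uv[1] = avg4(v[0][0], v[0][1], v[1][0], v[1][1]);  /* (794 + 2) >> 2 = 199 */
  printf("UV = %d %d\n", uv[0], uv[1]);
  return 0;
}

The odd-width tail in the row function uses the same idea with two samples, (a + b + 1) >> 1.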
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_gcc.cc b/TMessagesProj/jni/third_party/libyuv/source/row_gcc.cc
index 3088bb755..a107c30e7 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_gcc.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_gcc.cc
@@ -159,24 +159,24 @@ static const lvec8 kShuffleNV21 = {
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -190,35 +190,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -228,35 +228,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -267,35 +267,35 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
- "psrld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
@@ -307,25 +307,25 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
uint8_t* dst_rgb24,
int width) {
asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -337,44 +337,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -385,47 +385,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -436,34 +436,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -474,35 +474,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -513,35 +513,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -556,37 +556,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -615,26 +615,26 @@ static const ulvec8 kPermARGBToRGB24_2 = {
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -650,37 +650,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -694,34 +694,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -734,40 +734,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
const uint32_t dither4,
int width) {
asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -783,35 +783,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -824,38 +824,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -865,26 +865,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -928,31 +928,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028;
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -967,31 +967,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -1008,25 +1008,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
"1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1045,25 +1045,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
"1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1078,6 +1078,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif
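
For orientation, here is a scalar sketch of what the ARGB/ABGR to AR30 rows above compute per pixel. The SIMD versions reach the same result with the pshufb/pmulhuw/pand/pslld sequence shown in the hunks; the function name and the byte-replication trick below are illustrative and not a quote of the library's C reference.

#include <stdint.h>

// Scalar sketch: widen each 8-bit channel to 10 bits by replicating its top
// two bits, keep only the top two alpha bits, then pack A2 R10 G10 B10 into a
// little-endian 32-bit word (B in the low 10 bits, as the comments above say).
// libyuv "ARGB" is stored B,G,R,A in memory; the ABGR variant only changes
// which source bytes feed R and B.
static void ARGBToAR30Row_sketch(const uint8_t* src_argb,
                                 uint8_t* dst_ar30,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t b = src_argb[0];
    uint32_t g = src_argb[1];
    uint32_t r = src_argb[2];
    uint32_t a = src_argb[3];
    uint32_t ar30 = ((a >> 6) << 30) |
                    (((r << 2) | (r >> 6)) << 20) |
                    (((g << 2) | (g >> 6)) << 10) |
                    ((b << 2) | (b >> 6));
    dst_ar30[0] = (uint8_t)(ar30 & 0xff);        // little-endian store
    dst_ar30[1] = (uint8_t)((ar30 >> 8) & 0xff);
    dst_ar30[2] = (uint8_t)((ar30 >> 16) & 0xff);
    dst_ar30[3] = (uint8_t)(ar30 >> 24);
    src_argb += 4;
    dst_ar30 += 4;
  }
}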
+// clang-format off
+
// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
#define RGBTOY(round) \
@@ -1101,10 +1103,9 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x40(%0),%0 \n" \
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
- "paddw %%" #round \
- ",%%xmm6 \n" \
- "paddw %%" #round \
- ",%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
@@ -1130,10 +1131,9 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x80(%0),%0 \n" \
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
- "vpaddw %%" #round \
- ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
- "vpaddw %%" #round \
- ",%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
@@ -1144,13 +1144,15 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"jg 1b \n" \
"vzeroupper \n"
+// clang-format on
+
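As a rough guide to what one RGBTOY iteration produces per pixel, here is a scalar sketch. The real coefficients and the rounding value live in the constant tables loaded into xmm4/xmm5 and the "round" register before the macro runs, which are not part of these hunks; the BT.601 studio-range numbers below are an assumption used only for illustration. The hunks that follow also rejoin the split paddw string literals onto single lines and add a prefetcht0 to the macro body, which is why the clang-format off/on guards appear around it.

#include <stdint.h>

// Scalar sketch of the RGBTOY core: a weighted sum of B, G, R plus a rounding
// bias, then a shift down by 8.  The newly added "prefetcht0 1280(%0)" hint
// has a rough C-level analogue in GCC's __builtin_prefetch(src + 1280).
static uint8_t RGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  // Assumed BT.601 studio-range weights; the J ("JPEG") variants use
  // full-range weights and a plain 0.5 rounding bias instead of the +16 offset.
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}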
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
@@ -1169,8 +1171,8 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
@@ -1187,8 +1189,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN RGBTOY(xmm5)
: "+r"(src_rgba), // %0
@@ -1210,7 +1212,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
- "vmovdqu %6,%%ymm6 \n"
+ "vmovdqu %6,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
@@ -1232,7 +1234,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
- "vmovdqu %6,%%ymm6 \n"
+ "vmovdqu %6,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_abgr), // %0
@@ -1253,7 +1255,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
@@ -1273,7 +1275,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(
ymm5) "vzeroupper \n"
@@ -1294,52 +1296,52 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1366,44 +1368,44 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1429,44 +1431,44 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
@@ -1492,45 +1494,45 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1553,53 +1555,53 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1618,47 +1620,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1672,9 +1674,9 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
@@ -1693,52 +1695,52 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1752,9 +1754,9 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
@@ -1769,9 +1771,9 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
@@ -1790,52 +1792,52 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1853,52 +1855,52 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -2115,16 +2117,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2144,27 +2146,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2190,16 +2192,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2219,21 +2221,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2254,16 +2256,16 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2284,21 +2286,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2321,15 +2323,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422
YUVTORGB(yuvconstants)
STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2356,15 +2358,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2384,15 +2386,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2412,15 +2414,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2440,15 +2442,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2469,16 +2471,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2693,17 +2695,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2727,18 +2729,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2762,23 +2764,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2802,18 +2804,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2837,23 +2839,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2879,16 +2881,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2918,11 +2920,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
@@ -2962,16 +2964,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2995,16 +2997,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -3028,16 +3030,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -3061,16 +3063,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -3085,17 +3087,15 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
#endif // HAS_UYVYTOARGBROW_AVX2
#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
+ "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
@@ -3104,8 +3104,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
@@ -3121,28 +3121,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"sub $0x8,%2 \n"
"jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
+ "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
+ "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
@@ -3152,8 +3150,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
@@ -3163,15 +3161,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_AVX2
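
The I400ToARGB rewrite above replaces the hard-coded 1.164 constants with the yg/ygb values read from the yuvconstants block at the offsets the assembly uses (192 and 224 here), and switches from unsigned subtract plus logical shift to signed add plus arithmetic shift so the bias can be negative. A scalar sketch of the new data flow, with yg and ygb passed as plain parameters purely for illustration:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

// Sketch of the reworked luma expansion: duplicate the Y byte (punpcklbw),
// scale by yg with a 16-bit high multiply, add the signed bias ygb, shift
// right arithmetically by 6, then splat the result into B, G, R with opaque A.
static void I400ToARGB_sketch(const uint8_t* y_buf, uint8_t* dst_argb,
                              int yg, int ygb, int width) {
  for (int x = 0; x < width; ++x) {
    int y16 = y_buf[x] * 0x0101;                  // punpcklbw y,y
    int val = (((y16 * yg) >> 16) + ygb) >> 6;    // pmulhuw, paddsw, psraw
    uint8_t c = Clamp255(val);                    // packuswb clamps in the asm
    dst_argb[0] = dst_argb[1] = dst_argb[2] = c;  // B = G = R
    dst_argb[3] = 0xff;                           // alpha from the pslld mask
    dst_argb += 4;
  }
}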
@@ -3184,16 +3182,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %3,%%xmm5 \n"
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3211,13 +3209,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3228,55 +3226,154 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
+
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
- : "memory", "cc", "xmm0", "xmm1");
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
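
The hunk above splits the old behaviour in two: MirrorUVRow now reverses the interleaved UV pairs as 16-bit units (the new shuffle table swaps two bytes at a time), while the former deinterleaving mirror lives on as MirrorSplitUVRow. Scalar sketches of both, with illustrative names:

#include <stdint.h>

// MirrorUVRow: reverse the order of UV pairs, keeping U before V in each pair.
static void MirrorUVRow_sketch(const uint8_t* src_uv, uint8_t* dst_uv,
                               int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0];
    dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1];
  }
}

// MirrorSplitUVRow: reverse the pairs and split them into planar U and V.
static void MirrorSplitUVRow_sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                    uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * (width - 1 - x) + 0];
    dst_v[x] = src_uv[2 * (width - 1 - x) + 1];
  }
}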
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle first 5 pixels to last 5 mirrored. first byte zero
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle last 5 pixels to first 5 mirrored. last byte zero
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Shuffle 5 pixels at a time (15 bytes)
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
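
RGB24MirrorRow above works on 16 pixels (48 bytes) per loop and needs the two shuffle tables because a 3-byte pixel never lands on a 16-byte register boundary; the last pixel of each 48-byte group is stitched in with the second table and a movlpd. The scalar equivalent is simply a reversed copy of byte triples (sketch below, illustrative name):

#include <stdint.h>

// Scalar sketch: pixel x of the destination is pixel width-1-x of the source.
static void RGB24MirrorRow_sketch(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    const uint8_t* s = src_rgb24 + 3 * (width - 1 - x);
    dst_rgb24[3 * x + 0] = s[0];
    dst_rgb24[3 * x + 1] = s[1];
    dst_rgb24[3 * x + 2] = s[2];
  }
}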
+
#ifdef HAS_ARGBMIRRORROW_SSE2
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "lea -0x10(%0,%2,4),%0 \n"
+ "lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3292,15 +3389,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "vmovdqu %3,%%ymm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3316,28 +3413,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -3354,28 +3451,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -3392,22 +3489,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -3425,21 +3522,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3462,30 +3559,30 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
int width) {
// clang-format off
asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3508,24 +3605,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3546,23 +3643,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3578,25 +3675,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3617,25 +3714,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3651,26 +3748,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3722,41 +3819,41 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -3817,42 +3914,42 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -3874,35 +3971,35 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
LABELALIGN
"2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
- LABELALIGN "9: \n"
+ LABELALIGN "9: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3917,14 +4014,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3939,7 +4036,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep movsb \n"
+ "rep movsb \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -3952,29 +4049,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3987,21 +4084,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4020,17 +4117,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
@@ -4048,28 +4145,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
- "vmovdqa %3,%%ymm4 \n"
+ "vmovdqa %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -4084,31 +4181,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4121,23 +4218,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4153,7 +4250,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4164,7 +4261,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosb \n"
+ "rep stosb \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
@@ -4175,7 +4272,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4186,21 +4283,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4214,32 +4311,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4253,28 +4350,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4288,16 +4385,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4311,32 +4408,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4350,28 +4447,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4384,22 +4481,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -4414,32 +4511,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4454,30 +4551,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4492,17 +4589,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -4516,32 +4613,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4556,30 +4653,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4601,71 +4698,71 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4689,36 +4786,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(alpha), // %2
@@ -4741,43 +4838,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
"vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
"vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 32 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -4791,7 +4888,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
+// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
@@ -4801,35 +4898,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -4850,29 +4947,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4892,32 +4989,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
@@ -4937,52 +5034,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
int width) {
uintptr_t alpha;
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
// replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// end of VPGATHER
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4999,42 +5096,42 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psubb %%xmm5,%%xmm0 \n"
- "psubb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm4,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "movdqu %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm6 \n"
- "paddw %%xmm5,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm6,%%xmm3 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm6,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm6,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5061,50 +5158,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "m"(kARGBToSepiaB), // %2
@@ -5122,54 +5219,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5187,40 +5284,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -5238,27 +5335,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5275,28 +5372,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5314,26 +5411,26 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5359,15 +5456,15 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5387,14 +5484,14 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5415,15 +5512,15 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5443,14 +5540,14 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5472,40 +5569,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobelx,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -5526,39 +5623,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobely,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -5579,37 +5676,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5626,21 +5723,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_y,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -5661,36 +5758,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5709,67 +5806,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width) {
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop.
LABELALIGN
"10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(row), // %0
@@ -5789,119 +5886,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
uint8_t* dst,
int count) {
asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
// 4 pixel small loop.
LABELALIGN
"4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
@@ -5924,70 +6021,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
"+r"(src_argb_stride_temp), // %1
@@ -6009,68 +6106,68 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -6090,61 +6187,61 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
"vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "rep movsb \n"
- "jmp 999f \n"
+ "rep movsb \n"
+ "jmp 999f \n"
"99: \n"
"vzeroupper \n"
@@ -6166,20 +6263,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6200,16 +6297,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -6227,24 +6324,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6263,24 +6360,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6299,26 +6396,26 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -6338,26 +6435,26 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -6376,47 +6473,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
int width) {
asm volatile(
- "pxor %%xmm3,%%xmm3 \n"
+ "pxor %%xmm3,%%xmm3 \n"
// 2 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6512,27 +6609,27 @@ void HalfFloatRow_AVX2(const uint16_t* src,
int width) {
scale *= kScaleBias;
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -6553,8 +6650,8 @@ void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
@@ -6588,7 +6685,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
@@ -6622,21 +6719,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6655,18 +6752,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6685,86 +6782,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
uintptr_t pixel_temp;
uintptr_t table_temp;
asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
: "=&d"(pixel_temp), // %0
"=&a"(table_temp), // %1
"+r"(src_argb), // %2
@@ -6837,46 +6934,47 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
src_y_ptr = (uint8_t*)src_y;
asm volatile(
- "vmovdqu %5, %%ymm0 \n" // init blend value
- "vmovdqu %6, %%ymm1 \n" // init blend value
- "vmovdqu %7, %%ymm2 \n" // init blend value
- // "sub $0x20, %3 \n" //sub 32 from width for final loop
+ "vmovdqu %5, %%ymm0 \n" // init blend value
+ "vmovdqu %6, %%ymm1 \n" // init blend value
+ "vmovdqu %7, %%ymm2 \n" // init blend value
+ // "sub $0x20, %3 \n" //sub 32 from
+ // width for final loop
LABELALIGN
- "1: \n" // label 1
- "vmovdqu (%0,%4), %%ymm3 \n" // src_y
- "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
- "vmovdqu (%1), %%ymm5 \n" // src_uv
- "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
- "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
- // shuf
- "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
- // shuf
- "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
- "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
- // shuf
- "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
- "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
- "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
- "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
- "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
- "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
- "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
- "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
- "add $0x20, %4 \n" // add to src buffer
- // ptr
- "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
- "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
- "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
- "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
- "add $0x60,%2 \n" // add to dst buffer
- // ptr
- // "cmp %3, %4 \n" //(width64 -
+ "1: \n" // label 1
+ "vmovdqu (%0,%4), %%ymm3 \n" // src_y
+ "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
+ "vmovdqu (%1), %%ymm5 \n" // src_uv
+ "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
+ "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
+ // shuf
+ "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
+ // shuf
+ "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
+ "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
+ // shuf
+ "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
+ "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
+ "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
+ "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
+ "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
+ "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
+ "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
+ "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
+ "add $0x20, %4 \n" // add to src buffer
+ // ptr
+ "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
+ "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
+ "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
+ "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
+ "add $0x60,%2 \n" // add to dst buffer
+ // ptr
+ // "cmp %3, %4 \n" //(width64 -
// 32 bytes) and src_offset
- "sub $0x20,%3 \n" // 32 pixels per loop
- "jg 1b \n"
- "vzeroupper \n" // sse-avx2
- // transistions
+ "sub $0x20,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n" // sse-avx2
+ // transistions
: "+r"(src_y), //%0
"+r"(src_vu), //%1
@@ -6907,20 +7005,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
- "movdqu %3,%%xmm5 \n"
+ "movdqu %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
@@ -6937,16 +7035,16 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
@@ -6956,6 +7054,119 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
}
#endif // HAS_SWAPUVROW_AVX2
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
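HalfMergeUVRow_SSSE3 above and its AVX2 counterpart fold a 2x2 box filter over separate U and V planes and interleave the result into a single UV plane: pmaddubsw with a vector of 1s sums horizontal byte pairs, the second row is added in, and the shift-then-pavgw pair produces a rounded (a+b+c+d+2)>>2 average before the bytes are interleaved. A minimal scalar sketch of the same computation, assuming an even width (the function name here is illustrative):

#include <stdint.h>

// Rounded 2x2 average of U and of V, interleaved as U,V pairs into dst_uv.
// Produces width/2 pairs; width is assumed to be even.
static void HalfMergeUVRow_Scalar(const uint8_t* src_u, int src_stride_u,
                                  const uint8_t* src_v, int src_stride_v,
                                  uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_uv[0] = (uint8_t)((src_u[x] + src_u[x + 1] +
                           src_u[src_stride_u + x] +
                           src_u[src_stride_u + x + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[x] + src_v[x + 1] +
                           src_v[src_stride_v + x] +
                           src_v[src_stride_v + x + 1] + 2) >> 2);
    dst_uv += 2;
  }
}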
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+      "add         $0x4,%0                      \n"
+ "movd %%xmm0, (%1) \n" // store float
+      "add         $0x4,%1                      \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
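ClampFloatToZero_SSE2 above uses maxss against a zeroed register to replace negative floats with 0.0f, one value per iteration. A scalar sketch of the same clamp, written per float (the counter convention of the asm is left as in the patch, and NaN handling may differ slightly from maxss):

// Clamp negative floats to zero; zero and positive values are copied through.
// n counts floats here, which is an assumption about the calling convention.
static void ClampFloatToZero_Scalar(const float* src_x, float* dst_y, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    float v = src_x[i];
    dst_y[i] = (v < 0.0f) ? 0.0f : v;  // maxss(0, v) for ordered values
  }
}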
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_mmi.cc b/TMessagesProj/jni/third_party/libyuv/source/row_mmi.cc
index 50cfca726..9a8e2cb2d 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_mmi.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_mmi.cc
@@ -21,6 +21,8 @@ extern "C" {
// This module is for Mips MMI.
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+// clang-format off
+
void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
@@ -688,12 +690,15 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
"daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
"gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
@@ -707,7 +712,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
"pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
@@ -725,7 +731,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -752,7 +759,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
"pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
@@ -770,7 +778,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -797,7 +806,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
"pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
@@ -815,7 +825,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -842,7 +853,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
"pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
@@ -860,7 +872,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -898,11 +911,12 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
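Across ARGBToUVRow_MMI and the BGRA/ABGR/RGBA/RGB24/RAW variants that follow, the 2x2 chroma subsample changes from a truncating sum >> 2 to (sum + 1) >> 1, and the R/G/B weights packed into mask_u/mask_v are halved to compensate (the lane that scales the 0x4040 bias stays at 2). The intended effect appears to be one extra bit of precision plus round-to-nearest ahead of the color matrix, with the overall U/V scale unchanged. A scalar sketch of the two subsampling steps, assuming a, b, c, d are the four 8-bit samples of one 2x2 block:

// Old path: truncating average, full-scale matrix coefficients.
static inline int subsample_trunc(int a, int b, int c, int d) {
  return (a + b + c + d) >> 2;        // 0..255, fraction discarded
}

// New path: rounded half-sum kept at double scale; the halved coefficients
// in mask_u/mask_v restore the original scale during the matrix multiply.
static inline int subsample_round(int a, int b, int c, int d) {
  return (a + b + c + d + 1) >> 1;    // 0..510
}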
@@ -992,12 +1006,15 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
"daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
"gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
@@ -1011,7 +1028,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
"pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
@@ -1029,7 +1047,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1056,7 +1075,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
"pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
@@ -1074,7 +1094,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1101,7 +1122,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
"pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
@@ -1119,7 +1141,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1146,7 +1169,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
"pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
@@ -1164,7 +1188,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1202,11 +1227,12 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
@@ -1296,12 +1322,15 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002F00380002;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
"daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
"gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
@@ -1315,7 +1344,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
"dsll %[dest0_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
@@ -1333,7 +1363,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1360,7 +1391,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
"dsll %[dest1_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
@@ -1378,7 +1410,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1405,7 +1438,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
"dsll %[dest2_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
@@ -1423,7 +1457,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1450,7 +1485,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
"dsll %[dest3_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
@@ -1468,7 +1504,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1506,11 +1543,12 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
@@ -1600,12 +1638,15 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
"daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
"gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
@@ -1619,7 +1660,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
"dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
@@ -1637,7 +1679,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1664,7 +1707,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
"dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
@@ -1682,7 +1726,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1709,7 +1754,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
"dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
@@ -1727,7 +1773,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1754,7 +1801,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
"dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
@@ -1772,7 +1820,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1810,11 +1859,12 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
@@ -1908,12 +1958,15 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
"daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
"gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
@@ -1929,7 +1982,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
"pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
@@ -1949,7 +2003,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -1978,7 +2033,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
"pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
@@ -1998,7 +2054,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -2027,7 +2084,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
"pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
@@ -2047,7 +2105,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -2076,7 +2135,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
"pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
@@ -2096,7 +2156,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -2134,11 +2195,12 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
@@ -2232,12 +2294,15 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
"daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
"gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
@@ -2253,7 +2318,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
"dsll %[dest0_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
@@ -2273,7 +2339,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2302,7 +2369,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
"dsll %[dest1_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
@@ -2322,7 +2390,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2351,7 +2420,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
"dsll %[dest2_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
@@ -2371,7 +2441,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2400,7 +2471,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
"dsll %[dest3_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
@@ -2420,7 +2492,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2458,11 +2531,12 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
@@ -2471,10 +2545,10 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest, dest0, dest1, dest2, dest3;
uint64_t tmp0, tmp1;
- const uint64_t shift = 0x07;
- const uint64_t value = 0x0040;
+ const uint64_t shift = 0x08;
+ const uint64_t value = 0x80;
const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00010026004B000FULL;
+ const uint64_t mask1 = 0x0001004D0096001DULL;
__asm__ volatile(
"1: \n\t"
@@ -2558,8 +2632,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
uint64_t src_rgb1;
uint64_t ftmp[12];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x002b0054007f0002;
- const uint64_t mask_v = 0x0002007f006b0014;
+ const uint64_t mask_u = 0x0015002a003f0002;
+ const uint64_t mask_v = 0x0002003f0035000a;
__asm__ volatile(
"1: \n\t"
@@ -2572,8 +2646,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
@@ -2589,8 +2663,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2615,8 +2689,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
@@ -2632,8 +2706,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2658,8 +2732,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
@@ -2675,8 +2749,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2701,8 +2775,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
@@ -2718,8 +2792,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2762,7 +2836,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
: [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08),
[sixteen] "f"(0x10)
: "memory");
}
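The ARGBToUVJRow_MMI change above is the same idea applied to the JPEG-range path: instead of three pavgh steps, each of which rounds on ties, the two rows are summed with paddh and averaged once, and the J coefficients in mask_u/mask_v are halved to match. A small illustration of why the order matters, with pavg modelling what pavgh does per 16-bit lane (an assumption about operand pairing based on the unpack sequence in the hunks):

// pavgh per-lane behaviour: rounded average.
static inline int pavg(int a, int b) { return (a + b + 1) >> 1; }

// Old: round up to three times. For the block a=0, c=0, b=1, d=1
// (true mean 0.5) this gives pavg(pavg(0,0), pavg(1,1)) = pavg(0,1) = 1.
static inline int subsample_old_j(int a, int b, int c, int d) {
  return pavg(pavg(a, c), pavg(b, d));
}

// New: sum the rows, round once, keep double scale (0..510). The same block
// gives pavg(0, 2) = 1, which at the doubled scale represents 0.5 exactly.
static inline int subsample_new_j(int a, int b, int c, int d) {
  return pavg(a + c, b + d);
}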
@@ -4052,10 +4126,10 @@ void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
uint64_t tmp0, tmp1;
const uint64_t mask0 = 0x0;
const uint64_t mask1 = 0x01;
- const uint64_t mask2 = 0x00400026004B000FULL;
+ const uint64_t mask2 = 0x0080004D0096001DULL;
const uint64_t mask3 = 0xFF000000FF000000ULL;
const uint64_t mask4 = ~mask3;
- const uint64_t shift = 0x07;
+ const uint64_t shift = 0x08;
__asm__ volatile(
"1: \n\t"
@@ -4778,7 +4852,9 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
: "memory");
}
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+// TODO - respect YuvConstants
+void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf,
+ const struct YuvConstants*, int width) {
uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
const uint64_t mask0 = 0x0;
const uint64_t mask1 = 0x55;
@@ -4912,10 +4988,10 @@ void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
: "memory");
}
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
uint64_t src0, src1, dest0, dest1;
const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
const uint64_t mask1 = 0x1b;
@@ -6040,90 +6116,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub
- "or %[ub], %[ub], %[mask] \n\t" // must
- // sign
- // extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t" // sign
- // extension
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+ __asm__ volatile (
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
- "punpcklbh %[u], %[u], %[zero] \n\t" // u
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"//u
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t" // v
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"//v
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
// Also used for 420
@@ -6133,96 +6212,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub
- "or %[ub], %[ub], %[mask] \n\t" // must
- // sign
- // extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t" // sign
- // extension
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t" // v
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"//v
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
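I444ToARGBRow_MMI and I422ToARGBRow_MMI (and the 10-bit I210 variant that follows) share one fixed-point scheme: Y is replicated to 16 bits (y * 0x0101), scaled by the yg gain with an unsigned high multiply, then per-channel biases and U/V gains loaded from the YuvConstants table are applied before an arithmetic >> 6 and a saturating pack to bytes. A scalar sketch of that arithmetic is below; the gains and biases are left as parameters because the exact YuvConstants layout and signs are colorspace-dependent (which is also why the assembly ORs 0xff00 lanes into ub and vr to sign-extend them).

#include <stdint.h>

static uint8_t clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of the fixed-point YUV->RGB path used by the MMI kernels above.
// yg is the luma gain; ub/ug/vg/vr are x64 chroma gains and bb/bg/br are
// precomputed per-channel biases from the YuvConstants table. All of them
// are parameters here, not the actual table contents, and may be negative.
static void yuv_pixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                             int yg, int ub, int ug, int vg, int vr,
                             int bb, int bg, int br,
                             uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (int)(((uint32_t)y * 0x0101u * (uint32_t)yg) >> 16);  // pmulhuh step
  *b = clamp255((y1 + bb - u * ub) >> 6);   // the asm uses saturating paddsh/psubsh
  *g = clamp255((y1 + bg - u * ug - v * vg) >> 6);
  *r = clamp255((y1 + br - v * vr) >> 6);
}

The I422 variant differs only in duplicating each U/V sample across two pixels before this math, and the I210 variant shifts the 10-bit Y left by 6 and narrows and clamps U/V to 8 bits, as the surrounding hunks show.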
// 10 bit YUV to ARGB
@@ -6232,96 +6314,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "psllh %[y], %[y], %[six] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "psllh %[y], %[y], %[six] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "punpcklhw %[u], %[u], %[u] \n\t"
- "psrah %[u], %[u], %[two] \n\t"
- "punpcklhw %[v], %[v], %[v] \n\t"
- "psrah %[v], %[v], %[two] \n\t"
- "pminsh %[u], %[u], %[mask1] \n\t"
- "pminsh %[v], %[v], %[mask1] \n\t"
+ "punpcklhw %[u], %[u], %[u] \n\t"
+ "psrah %[u], %[u], %[two] \n\t"
+ "punpcklhw %[v], %[v], %[v] \n\t"
+ "psrah %[v], %[v], %[two] \n\t"
+ "pminsh %[u], %[u], %[mask1] \n\t"
+ "pminsh %[v], %[v], %[mask1] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02),
- [mask1] "f"(0x00ff00ff00ff00ff)
- : "memory");
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask), [two]"f"(0x02),
+ [mask1]"f"(0x00ff00ff00ff00ff)
+ : "memory"
+ );
}
void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
@@ -6331,96 +6419,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v, a;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v,a;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
- "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
+ "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a),
- [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]),
- [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]),
- [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub),
- [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb),
- [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00),
- [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v), [a]"=&f"(a),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [a_ptr]"r"(src_a), [zero]"f"(0x00),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
void I422ToRGB24Row_MMI(const uint8_t* src_y,
@@ -6429,105 +6523,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
- "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
- "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
- "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
- "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
- "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
- "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
- "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
- "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+ "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
+ "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
+ "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
+ "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
+ "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
+ "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
+ "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
+ "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
+ "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1)
- : "memory");
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(mask),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
}
void I422ToARGB4444Row_MMI(const uint8_t* src_y,
@@ -6538,103 +6640,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "and %[g_vec], %[g_vec], %[mask1] \n\t"
- "psrlw %[g_vec], %[g_vec], %[four] \n\t"
- "psrlw %[r_vec], %[g_vec], %[four] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "and %[g_vec], %[g_vec], %[mask1] \n\t"
+ "psrlw %[g_vec], %[g_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[g_vec], %[four] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[g_vec], %[g_vec], %[r_vec] \n\t"
- "and %[b_vec], %[b_vec], %[mask1] \n\t"
- "psrlw %[b_vec], %[b_vec], %[four] \n\t"
- "psrlw %[r_vec], %[b_vec], %[four] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[b_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
+ "and %[b_vec], %[b_vec], %[mask1] \n\t"
+ "psrlw %[b_vec], %[b_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[b_vec], %[four] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4),
- [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),
+ [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void I422ToARGB1555Row_MMI(const uint8_t* src_y,
@@ -6645,118 +6754,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "or %[g_vec], %[g_vec], %[mask3] \n\t"
+ "psrlw %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "or %[g_vec], %[g_vec], %[mask3] \n\t"
- "psrlw %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "or %[b_vec], %[b_vec], %[mask3] \n\t"
+ "psrlw %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "or %[b_vec], %[b_vec], %[mask3] \n\t"
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8),
- [mask3] "f"(0x800000008000), [lmove5] "f"(0x5)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [mask3]"f"(0x800000008000),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
}
void I422ToRGB565Row_MMI(const uint8_t* src_y,
@@ -6767,120 +6883,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7),
- [lmove5] "f"(0x5)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
}
void NV12ToARGBRow_MMI(const uint8_t* src_y,
@@ -6890,83 +7013,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void NV21ToARGBRow_MMI(const uint8_t* src_y,
@@ -6976,83 +7107,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void NV12ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7062,95 +7201,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18),
- [one] "f"(0x1), [rmove1] "f"(0x8)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [lmove1]"f"(0x18),
+ [one]"f"(0x1), [rmove1]"f"(0x8)
+ : "memory"
+ );
}
void NV21ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7160,95 +7307,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18),
- [rmove1] "f"(0x8), [one] "f"(0x1)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
}
void NV12ToRGB565Row_MMI(const uint8_t* src_y,
@@ -7258,115 +7413,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t" // 5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t" // 5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7)
+ : "memory"
+ );
}
void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
@@ -7375,83 +7538,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
- "psrlh %[temp], %[y], %[eight] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
+ "psrlh %[temp], %[y], %[eight] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[y], %[y], %[temp] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[y], %[y], %[temp] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
}
void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
@@ -7460,83 +7630,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[temp], %[y], %[temp] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[temp], %[y], %[temp] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
- "psrlh %[y], %[y], %[eight] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "psrlh %[y], %[y], %[eight] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
}
void I422ToRGBARow_MMI(const uint8_t* src_y,
@@ -7547,105 +7724,114 @@ void I422ToRGBARow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
- "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
- "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
+ "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
+ "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1)
- : "memory");
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
- __asm__ volatile(
- "punpcklwd %[v32], %[v32], %[v32] \n\t"
- "1: \n\t"
- "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
+ __asm__ volatile (
+ "punpcklwd %[v32], %[v32], %[v32] \n\t"
+ "1: \n\t"
+ "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "bnez %[width], 1b \n\t"
- : [v32] "+&f"(v32)
- : [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
+ "daddi %[width], %[width], -0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [v32]"+&f"(v32)
+ : [dst_ptr]"r"(dst_argb), [width]"r"(width)
+ : "memory"
+ );
}
+// clang-format on
// 10 bit YUV to ARGB
#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
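
The MMI kernels in the hunks above (NV12, NV21, YUY2, UYVY, I422) all share one fixed-point core: Y is replicated into a 16-bit lane and scaled by the high half of an unsigned multiply, the bias/coefficient pairs come from the yuvconstants table at the offsets visible in the asm (0x00 ub, 0x20 ug/vg, 0x40 vr, 0x60 bb, 0x80 bg, 0xa0 br, 0xc0 yg), and each channel is shifted right by 6 and saturated to a byte. A minimal scalar sketch of that per-pixel math follows; the helper names are hypothetical, the coefficient layout is inferred from the asm, and the saturating vector adds are approximated with plain integer arithmetic.

#include <stdint.h>

/* Hypothetical scalar model of the MMI row kernels above; clamp255 and
 * yuv_pixel_model are illustrative helpers, not libyuv APIs. */
static inline uint8_t clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_pixel_model(uint8_t y, uint8_t u, uint8_t v,
                            int32_t ub, int32_t ug, int32_t vg, int32_t vr,
                            int32_t bb, int32_t bg, int32_t br, uint32_t yg,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  /* punpcklbh y,y,y + pmulhuh y,yg == high 16 bits of (y * 0x0101) * yg */
  int32_t y1 = (int32_t)(((uint32_t)y * 0x0101u * yg) >> 16);
  /* paddsh / pmullh / psubsh, then psrah ..., 6 and packushb saturation */
  *b = clamp255((y1 + bb - (int32_t)u * ub) >> 6);
  *g = clamp255((y1 + bg - (int32_t)u * ug - (int32_t)v * vg) >> 6);
  *r = clamp255((y1 + br - (int32_t)v * vr) >> 6);
}

The NV21 and UYVY variants differ only in how u and v are shuffled out of the packed source (the ushu/vshu selectors), not in this arithmetic.
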
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_msa.cc b/TMessagesProj/jni/third_party/libyuv/source/row_msa.cc
index 5c0239a37..fe6df93a6 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_msa.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_msa.cc
@@ -155,11 +155,10 @@ extern "C" {
}
// Loads current and next row of ARGB input and averages it to calculate U and V
-#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
{ \
v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v16u8 vec8_m, vec9_m; \
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
@@ -195,81 +194,81 @@ extern "C" {
reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
- vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
- vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
- vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
- vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
- vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
- vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
- vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
- vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
- reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
- reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
- reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
- reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
- reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
- reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
- reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
- reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
- reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ reg8_m += const_0x0101; \
+ reg9_m += const_0x0101; \
+ reg0_m += const_0x0101; \
+ reg1_m += const_0x0101; \
+ argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
+ argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
+ argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
+ argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
+ }
+
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, shift, u_out, v_out) \
+ { \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const0); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const0); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const0); \
+ reg0_m += const1; \
+ reg1_m += const1; \
+ reg2_m += const1; \
+ reg3_m += const1; \
+ reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
+ reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
+ reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
+ reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
+ reg0_m = __msa_srl_w(reg0_m, shift); \
+ reg1_m = __msa_srl_w(reg1_m, shift); \
+ reg2_m = __msa_srl_w(reg2_m, shift); \
+ reg3_m = __msa_srl_w(reg3_m, shift); \
+ u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
}
// Takes ARGB input and calculates U and V.
-#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
- shf0, shf1, shf2, shf3, v_out, u_out) \
- { \
- v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
- \
- vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
- vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
- vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
- vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
- vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
- vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
- vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
- vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
- reg0_m = __msa_dotp_u_h(vec0_m, const1); \
- reg1_m = __msa_dotp_u_h(vec1_m, const1); \
- reg2_m = __msa_dotp_u_h(vec4_m, const1); \
- reg3_m = __msa_dotp_u_h(vec5_m, const1); \
- reg0_m += const3; \
- reg1_m += const3; \
- reg2_m += const3; \
- reg3_m += const3; \
- reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
- reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
- reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
- reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
- v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
- u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const1); \
+ reg0_m += (v4u32)const3; \
+ reg1_m += (v4u32)const3; \
+ reg2_m += (v4u32)const3; \
+ reg3_m += (v4u32)const3; \
+ reg0_m -= __msa_dotp_u_w(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_w(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
+ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
}
// Load I444 pixel data
@@ -302,6 +301,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ v8u16 src, dst;
+ v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < width; x += 8) {
+ src = LD_UH(src_uv);
+ dst = __msa_vshf_h(shuffler, src, src);
+ ST_UH(dst, dst_uv);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
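
MirrorUVRow_MSA, added above, mirrors the row one UV pair (one halfword) at a time so that U and V stay paired rather than being byte-reversed. A scalar sketch of the same operation, assuming width counts UV pairs as the vector loop does:

/* Illustrative scalar equivalent of MirrorUVRow_MSA; not a libyuv API. */
static void mirror_uv_row_scalar(const uint8_t* src_uv, uint8_t* dst_uv,
                                 int width) { /* width = number of UV pairs */
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0]; /* U */
    dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1]; /* V */
  }
}
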
@@ -825,12 +838,13 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
@@ -889,12 +903,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
- reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
- reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
- reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg4 += const_0x0001;
+ reg5 += const_0x0001;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
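
This hunk (and the matching ones in RGB24ToUVRow_MSA and RAWToUVRow_MSA below) swaps the truncating 2x2 box filter for a rounded one: the four-sample sum now gets +1 and a single 1-bit shift instead of a 2-bit shift, and the U/V coefficients are halved (0x70 -> 0x38, 0x4A -> 0x25, and so on) so the overall scale stays the same while one extra bit of chroma precision is carried into the weighted sum. A rough scalar illustration of the before/after, with coef standing in for any of those weights:

/* Hypothetical comparison of the old and new chroma subsampling; a, b, c, d
 * are the four samples of one 2x2 block. */
static uint32_t uv_term_old(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                            uint32_t coef) {
  uint32_t avg = (uint32_t)(a + b + c + d) >> 2; /* truncates, drops 2 bits */
  return avg * coef;
}

static uint32_t uv_term_new(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                            uint32_t coef) {
  uint32_t half = ((uint32_t)(a + b + c + d) + 1) >> 1; /* rounded, 9 bits */
  return half * (coef >> 1);                            /* halved weight   */
}
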
@@ -1412,17 +1432,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
- v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
- reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
- reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
- reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ reg0 = __msa_dotp_u_h(vec0, const_0x961D);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
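
The grey/YJ constants above move from the old 7-bit weighting (0x4B0F, 0x26, shift 7) to 8-bit full-range BT.601 weights: 0x1D for B and 0x96 for G packed into 0x961D, 0x4D for R, with srari rounding at 8 bits. Since 29 + 150 + 77 = 256, the result needs no further scaling. A scalar sketch of the new weighting (illustrative only; libyuv's ARGB byte order is B, G, R, A):

/* Hypothetical scalar equivalent of the new ARGBGrayRow/ARGBToYJRow weights. */
static uint8_t argb_to_yj(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8); /* srari ..., 8 */
}
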
@@ -2031,12 +2051,13 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2085,10 +2106,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h((v8i16)reg0, 2);
- reg1 = __msa_srai_h((v8i16)reg1, 2);
- reg2 = __msa_srai_h((v8i16)reg2, 2);
- reg3 = __msa_srai_h((v8i16)reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h((v8i16)reg0, 1);
+ reg1 = __msa_srai_h((v8i16)reg1, 1);
+ reg2 = __msa_srai_h((v8i16)reg2, 1);
+ reg3 = __msa_srai_h((v8i16)reg3, 1);
vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
@@ -2136,12 +2161,13 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2190,10 +2216,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h(reg0, 2);
- reg1 = __msa_srai_h(reg1, 2);
- reg2 = __msa_srai_h(reg2, 2);
- reg3 = __msa_srai_h(reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h(reg0, 1);
+ reg1 = __msa_srai_h(reg1, 1);
+ reg2 = __msa_srai_h(reg2, 1);
+ reg3 = __msa_srai_h(reg3, 1);
vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
@@ -2419,16 +2449,16 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
- v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
- v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
+ v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
+ v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
- ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
@@ -2504,61 +2534,123 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 vec0, vec1, vec2, vec3;
- v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
- v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
- v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 dst0, dst1, dst2, dst3;
+ v16u8 zero = {0};
+ v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
+ v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080);
+ v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
+ v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
+ v4i32 shift = __msa_fill_w(0x00000008);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((void*)s, 0);
- src1 = (v16u8)__msa_ld_b((void*)s, 16);
- src2 = (v16u8)__msa_ld_b((void*)s, 32);
- src3 = (v16u8)__msa_ld_b((void*)s, 48);
- src4 = (v16u8)__msa_ld_b((void*)t, 0);
- src5 = (v16u8)__msa_ld_b((void*)t, 16);
- src6 = (v16u8)__msa_ld_b((void*)t, 32);
- src7 = (v16u8)__msa_ld_b((void*)t, 48);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec0 = __msa_aver_u_b(src4, src6);
- vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((void*)s, 64);
- src1 = (v16u8)__msa_ld_b((void*)s, 80);
- src2 = (v16u8)__msa_ld_b((void*)s, 96);
- src3 = (v16u8)__msa_ld_b((void*)s, 112);
- src4 = (v16u8)__msa_ld_b((void*)t, 64);
- src5 = (v16u8)__msa_ld_b((void*)t, 80);
- src6 = (v16u8)__msa_ld_b((void*)t, 96);
- src7 = (v16u8)__msa_ld_b((void*)t, 112);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec2 = __msa_aver_u_b(src4, src6);
- vec3 = __msa_aver_u_b(src5, src7);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
+ src1 = __msa_ld_b((void*)s, 0);
+ src3 = __msa_ld_b((void*)s, 16);
+ src5 = __msa_ld_b((void*)t, 0);
+ src7 = __msa_ld_b((void*)t, 16);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 32);
+ src3 = __msa_ld_b((void*)s, 48);
+ src5 = __msa_ld_b((void*)t, 32);
+ src7 = __msa_ld_b((void*)t, 48);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst0, dst1);
+
+ src1 = __msa_ld_b((void*)s, 64);
+ src3 = __msa_ld_b((void*)s, 80);
+ src5 = __msa_ld_b((void*)t, 64);
+ src7 = __msa_ld_b((void*)t, 80);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 96);
+ src3 = __msa_ld_b((void*)s, 112);
+ src5 = __msa_ld_b((void*)t, 96);
+ src7 = __msa_ld_b((void*)t, 112);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst2, dst3);
+
+ dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
+ dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
s += 128;
t += 128;
dst_v += 16;
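Note on the hunk above: the rewritten ARGBToUVRow_MSA loop trades the old byte-domain averaging (three rounded __msa_aver_u_b passes over 8-bit data) for widened arithmetic. Each row is zero-extended to 16 bits with __msa_ilvr_b/__msa_ilvl_b, the two rows are summed exactly, and only the final horizontal step uses a rounded average (__msa_aver_u_h); the remaining scale factor is presumably handled inside the ARGBTOUV macro, whose definition is outside this hunk. A minimal scalar sketch of the per-channel difference, assuming a/b are the top-row samples and c/d the bottom-row samples of one 2x2 block (helper names are illustrative only, not part of the patch):

#include <stdint.h>

/* Old path: vertical average, then horizontal average, both rounded at 8 bits. */
static uint8_t box_avg_old(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint8_t left = (uint8_t)((a + c + 1) >> 1);   /* __msa_aver_u_b across rows    */
  uint8_t right = (uint8_t)((b + d + 1) >> 1);
  return (uint8_t)((left + right + 1) >> 1);    /* __msa_aver_u_b across columns */
}

/* New path: exact 16-bit column sums, a single rounded halving; any further
   scaling is left to the coefficient/shift stage in ARGBTOUV (not shown here). */
static uint16_t box_sum_new(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint16_t left = (uint16_t)a + c;              /* ilvr/ilvl_b + 16-bit add      */
  uint16_t right = (uint16_t)b + d;
  return (uint16_t)((left + right + 1) >> 1);   /* __msa_aver_u_h                */
}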
@@ -2574,28 +2666,30 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
- s += 128;
- t += 128;
- dst_v += 16;
- dst_u += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2607,29 +2701,30 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
- v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, src0, src1, src2, src3);
- ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2641,28 +2736,30 @@ void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
+ v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
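The BGRA, ABGR and RGBA MSA UV rows above all follow the same pattern: the per-iteration width drops from 32 to 16 pixels (64 source bytes per row), and since only 8 bytes of U and 8 bytes of V are produced per pass, the full-vector ST_UB stores are replaced by 64-bit copies of the low doubleword. A strict-aliasing-safe sketch of that store, assuming little-endian MSA where element 0 of __msa_copy_u_d is the low 8 bytes (the helper name is illustrative only):

#include <stdint.h>
#include <string.h>

/* Stands in for:  *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
   i.e. store the low doubleword (8 bytes) of the 128-bit vector result. */
static void store_low_8(uint8_t* dst, const uint8_t vec[16]) {
  memcpy(dst, vec, 8);  /* the high 8 bytes of the vector are not written */
}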
@@ -2734,13 +2831,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
}
}
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+// TODO - respect YuvConstants
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
v4i32 reg0, reg1, reg2, reg3;
- v4i32 vec_yg = __msa_fill_w(0x4A35);
- v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v4i32 vec_yg = __msa_fill_w(yg);
+ v8i16 vec_ygb = __msa_fill_h(ygb);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 max = __msa_ldi_h(0xFF);
v8i16 zero = {0};
@@ -3006,7 +3114,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
int x;
- v16u8 src0, src1, src2, src3, dst0, dst1;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
@@ -3051,12 +3159,12 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
- vec0 += vec8;
- vec1 += vec9;
- vec2 += vec10;
- vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
+ dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
@@ -3082,7 +3190,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
- for (x = 0; x < width; x += 8) {
+ for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
@@ -3315,10 +3423,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
}
}
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_neon.cc b/TMessagesProj/jni/third_party/libyuv/source/row_neon.cc
index 1cf8eefea..a5aeaabfb 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_neon.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_neon.cc
@@ -114,11 +114,11 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -140,11 +140,11 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -168,10 +168,10 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -195,10 +195,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -221,9 +221,9 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -253,9 +253,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -287,10 +287,10 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -321,14 +321,14 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
+ "vmov.u8 d4, #0x0f \n" // vbic bits to clear
"1: \n"
READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -342,35 +342,38 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
"q12", "q13", "q14", "q15");
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
- [kUVToG] "r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
}
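I400ToARGBRow_NEON now mirrors the MSA change above: it gains a const struct YuvConstants* parameter, and the asm operands read the caller-supplied tables instead of the previously hard-wired kYuvI601Constants. A caller wanting the old behaviour passes that table explicitly; a minimal sketch, assuming the usual declarations are available from "libyuv/row.h" and that width is a multiple of 8 (the loop consumes 8 pixels per iteration):

#include <stdint.h>
#include "libyuv/row.h"  /* I400ToARGBRow_NEON, kYuvI601Constants (assumed in scope) */

/* Sketch only: grey -> ARGB for one row, reproducing the pre-patch behaviour
   that baked the BT.601 constants into the function itself. */
static void ConvertGreyRowI601(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  I400ToARGBRow_NEON(src_y, dst_argb, &kYuvI601Constants, width);
}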
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -384,11 +387,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -407,11 +410,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -436,9 +439,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -463,9 +466,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -486,9 +489,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -506,11 +509,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -527,11 +530,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -550,11 +553,11 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -571,11 +574,11 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -593,13 +596,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -618,13 +621,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -639,10 +642,10 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
@@ -654,11 +657,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -668,11 +671,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
@@ -682,41 +685,62 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
"1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
+ : "r"(-32) // %3
+ : "cc", "memory", "q0", "q1", "q2");
}
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
"1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -725,37 +749,57 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
: "cc", "memory", "r12", "q0");
}
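The patch splits the old NEON UV mirror into two entry points: MirrorUVRow_NEON now keeps the output interleaved (vld2 / vrev64 / vst2 into a single dst_uv), while the previous reverse-and-split behaviour lives on unchanged as MirrorSplitUVRow_NEON. Scalar sketches of the two contracts, assuming width counts UV pairs as the asm suggests (subs of 8 per 16 source bytes); the _Sketch names are illustrative, not from the patch:

#include <stdint.h>

/* MirrorUVRow: reverse the order of the UV pairs, keep each pair interleaved. */
static void MirrorUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  src_uv += (width - 1) * 2;  /* start at the last UV pair */
  for (x = 0; x < width; ++x, src_uv -= 2, dst_uv += 2) {
    dst_uv[0] = src_uv[0];  /* U */
    dst_uv[1] = src_uv[1];  /* V */
  }
}

/* MirrorSplitUVRow: reverse the pair order and deinterleave into U and V planes. */
static void MirrorSplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                    uint8_t* dst_v, int width) {
  int x;
  src_uv += (width - 1) * 2;
  for (x = 0; x < width; ++x, src_uv -= 2) {
    dst_u[x] = src_uv[0];
    dst_v[x] = src_uv[1];
  }
}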
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
"1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ src_rgb24 += width * 3 - 24;
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"(-24) // %3
+ : "cc", "memory", "d0", "d1", "d2");
}
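RGB24MirrorRow_NEON is a newly added row function rather than a reformat: it biases src_rgb24 to the last group of 8 pixels, walks backwards with a -24 byte stride, and reverses each 8-byte channel register with vrev64.8. The intended per-pixel contract, as a scalar sketch (assuming width is a multiple of 8, as the 8-pixels-per-iteration loop suggests; the name is illustrative only):

#include <stdint.h>

/* Write the pixels of one RGB24 row in reverse order. */
static void RGB24MirrorRow_Sketch(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
                                  int width) {
  int x;
  src_rgb24 += (width - 1) * 3;  /* last pixel of the source row */
  for (x = 0; x < width; ++x, src_rgb24 -= 3, dst_rgb24 += 3) {
    dst_rgb24[0] = src_rgb24[0];  /* channels copied as-is, whatever the order */
    dst_rgb24[1] = src_rgb24[1];
    dst_rgb24[2] = src_rgb24[2];
  }
}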
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -766,13 +810,13 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -783,13 +827,13 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
- "vmov.u8 d0, #255 \n" // Alpha
+ "vmov.u8 d0, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
@@ -800,12 +844,12 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -830,13 +874,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -876,13 +920,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -905,13 +949,13 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -925,11 +969,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -941,11 +985,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
@@ -957,10 +1001,10 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -972,10 +1016,10 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -990,11 +1034,11 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1010,11 +1054,11 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1030,16 +1074,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
@@ -1057,16 +1101,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
@@ -1084,14 +1128,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1107,12 +1151,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1129,12 +1173,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1149,11 +1193,11 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1166,16 +1210,16 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "vdup.32 d2, %2 \n" // dither4
+ "vdup.32 d2, %2 \n" // dither4
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1188,11 +1232,11 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1204,14 +1248,14 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1221,20 +1265,20 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1247,11 +1291,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
@@ -1262,18 +1306,18 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1283,18 +1327,18 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d1, d24 \n" // B
- "vmlal.u8 q2, d2, d25 \n" // G
- "vmlal.u8 q2, d3, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d1, d24 \n" // B
+ "vmlal.u8 q2, d2, d25 \n" // G
+ "vmlal.u8 q2, d3, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1308,32 +1352,32 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
// coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1365,34 +1409,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1411,34 +1455,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1456,34 +1500,34 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2
@@ -1501,34 +1545,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
@@ -1546,34 +1590,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
@@ -1591,34 +1635,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
@@ -1636,34 +1680,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
@@ -1682,55 +1726,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2
@@ -1748,55 +1792,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2
@@ -1814,55 +1858,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2
@@ -1875,21 +1910,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1901,21 +1936,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1927,21 +1962,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1951,20 +1986,20 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1974,20 +2009,20 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1997,20 +2032,20 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2020,20 +2055,20 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2043,20 +2078,20 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2064,6 +2099,48 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
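The hunk above adds RGB24ToYJRow_NEON and RAWToYJRow_NEON: full-range (JPEG) luma rows with no +16 offset; the RAW variant just swaps which coefficient lands in d4/d6 to match the reversed byte order. A scalar sketch, assuming RGB24 is stored B,G,R and RAW is stored R,G,B as elsewhere in libyuv.

#include <stdint.h>

static uint8_t RGBToYJ_Sketch(int r, int g, int b) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);  /* 29 + 150 + 77 = 256 */
}

static void RGB24ToYJRow_C_Sketch(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  for (int x = 0; x < width; ++x, src_rgb24 += 3) {
    dst_yj[x] = RGBToYJ_Sketch(src_rgb24[2], src_rgb24[1], src_rgb24[0]);
  }
}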
@@ -2072,46 +2149,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2129,51 +2206,51 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, #8 \n"
+ "blt 89f \n"
// Blend 8 pixels.
"8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
// Blend 1 pixels.
"1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
- "99: \n"
+ "99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2190,16 +2267,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2215,32 +2292,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2257,28 +2334,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2291,20 +2368,20 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2318,32 +2395,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
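ARGBSepiaRow_NEON rewrites the row in place with the three dot products whose coefficients are loaded into d20-d30 (only the R formula is quoted in the surrounding comments); vqshrn #7 saturates, so each result is clamped at 255. A scalar sketch:

#include <stdint.h>

static uint8_t Clamp255_Sketch(int v) { return (uint8_t)(v > 255 ? 255 : v); }

static void ARGBSepiaRow_C_Sketch(uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x, dst += 4) {
    int b = dst[0], g = dst[1], r = dst[2];
    dst[0] = Clamp255_Sketch((17 * b + 68 * g + 35 * r) >> 7);  /* sepia B */
    dst[1] = Clamp255_Sketch((22 * b + 88 * g + 45 * r) >> 7);  /* sepia G */
    dst[2] = Clamp255_Sketch((24 * b + 98 * g + 50 * r) >> 7);  /* sepia R */
  }
}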
@@ -2359,51 +2436,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
"1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2420,19 +2497,19 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2449,13 +2526,13 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2472,13 +2549,13 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2497,17 +2574,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2524,12 +2601,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2548,15 +2625,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2576,23 +2653,23 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2614,23 +2691,23 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2652,18 +2729,18 @@ void HalfFloat1Row_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2678,18 +2755,18 @@ void HalfFloatRow_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2704,17 +2781,17 @@ void ByteToFloatRow_NEON(const uint8_t* src,
asm volatile(
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2731,26 +2808,26 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "vmov.u16 d6, #4 \n" // constant 4
- "vmov.u16 d7, #6 \n" // constant 6
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
"1: \n"
- "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
- "vld1.16 {q2}, [%4]! \n"
- "vaddl.u16 q0, d2, d4 \n" // * 1
- "vaddl.u16 q1, d3, d5 \n" // * 1
- "vld1.16 {q2}, [%1]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "vld1.16 {q2}, [%2]! \n"
- "vmlal.u16 q0, d4, d7 \n" // * 6
- "vmlal.u16 q1, d5, d7 \n" // * 6
- "vld1.16 {q2}, [%3]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "subs %6, %6, #8 \n" // 8 processed per loop
- "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
- "bgt 1b \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2768,8 +2845,8 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "vmov.u32 q10, #4 \n" // constant 4
- "vmov.u32 q11, #6 \n" // constant 6
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
"1: \n"
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
@@ -2807,16 +2884,16 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q2}, [%0]! \n" // load 16 Y values
- "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
- "vmov d1, d0 \n"
- "vzip.u8 d0, d1 \n" // VV
- "vmov d3, d2 \n"
- "vzip.u8 d2, d3 \n" // UU
- "subs %3, %3, #16 \n" // 16 pixels per loop
- "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
- "vst3.8 {d1, d3, d5}, [%2]! \n"
- "bgt 1b \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
@@ -2830,24 +2907,24 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
uint8_t* dst_uv,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
// pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
// pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d0, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
- "bgt 1b \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_uv), // %2
@@ -2861,24 +2938,24 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
uint8_t* dst_vu,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
// pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
// pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d1, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
- "bgt 1b \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_vu), // %2
@@ -2892,11 +2969,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2908,12 +2985,12 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
- "vld2.8 {d1, d3}, [%0]! \n"
- "vorr.u8 q2, q0, q0 \n" // move U after V
- "subs %2, %2, #16 \n" // 16 pixels per loop
- "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
@@ -2921,6 +2998,39 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
: "cc", "memory", "q0", "q1", "q2");
}
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n"
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
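
Note: the HalfMergeUVRow_NEON routine added at the end of this file box-filters two rows of the U and V planes (vpaddl/vpadal pairwise adds followed by a rounding vqrshrn #2) and writes the result as interleaved UV at half width and half height. A scalar sketch of the same computation follows; it is an illustration only, not the patch's code (the _C_sketch name is made up here, and odd widths are ignored for brevity).

#include <stdint.h>

// Scalar sketch of the 2x2 U/V merge done by HalfMergeUVRow_NEON above.
// Each output UV pair is the rounded average of a 2x2 block taken from the
// U plane and the V plane; width counts source pixels, as in the NEON loop.
static void HalfMergeUVRow_C_sketch(const uint8_t* src_u, int src_stride_u,
                                    const uint8_t* src_v, int src_stride_v,
                                    uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
                           src_u[src_stride_u + 1] + 2) >> 2);  // rounded U
    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
                           src_v[src_stride_v + 1] + 2) >> 2);  // rounded V
    src_u += 2;
    src_v += 2;
    dst_uv += 2;
  }
}
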
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_neon64.cc b/TMessagesProj/jni/third_party/libyuv/source/row_neon64.cc
index 866e7bfc6..d5258a3ae 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_neon64.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_neon64.cc
@@ -68,13 +68,13 @@ extern "C" {
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+#define YUVTORGB_SETUP \
+ "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+// clang-format off
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
@@ -89,29 +89,23 @@ extern "C" {
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
+ "mul v3.8h, v27.8h, v1.8h \n" \
+ "mul v5.8h, v29.8h, v1.8h \n" \
+ "mul v6.8h, v30.8h, v2.8h \n" \
+ "mul v7.8h, v28.8h, v2.8h \n" \
"sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* G */ \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
+// clang-format on
+
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -120,13 +114,16 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
READYUV444
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -149,13 +146,17 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -179,13 +180,17 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v23.8b}, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -209,13 +214,16 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v23, v22, v21)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -238,12 +246,15 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -265,6 +276,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
+// clang-format off
+
void I422ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -272,13 +285,17 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTORGB565
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -308,14 +325,18 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTOARGB1555
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -328,6 +349,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
}
+// clang-format on
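
Note: the 16-bit store paths above go through the ARGBTORGB565 and ARGBTOARGB1555 macros (ARGBTOARGB4444 is defined just below), which use shift-right-insert instructions to pack the 8-bit channels into one halfword per pixel. A hedged scalar sketch of the packing these formats imply; the helper names are illustrative only.

#include <stdint.h>

// Scalar sketch of the 16-bit pixel packings used by the macros above/below.
static inline uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
static inline uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
                    (b >> 3));
}
static inline uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
                    (b >> 4));
}
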
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
@@ -347,15 +369,18 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB4444
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -370,23 +395,27 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
);
}
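
Note: a change repeated throughout these arm64 hunks is the insertion of "prfm pldl1keep, [%N, 448]" after each vector load. It asks the core to prefetch the data roughly 448 bytes ahead of the read pointer into L1 and keep it there, hiding memory latency for upcoming iterations. The closest portable analogue is the compiler prefetch builtin; below is a minimal sketch of the same read-ahead pattern, assuming GCC or Clang (the function itself is illustrative, not part of libyuv).

#include <string.h>

// Read-ahead prefetching analogous to the "prfm pldl1keep, [%0, 448]"
// instructions added in the NEON loops. Assumes width is a multiple of 32.
void copy_with_prefetch(const unsigned char* src, unsigned char* dst,
                        int width) {
  int x;
  for (x = 0; x < width; x += 32) {
    __builtin_prefetch(src + x + 448, 0, 3);  // 0 = read, 3 = keep in cache
    memcpy(dst + x, src + x, 32);             // work on the current 32 bytes
  }
}
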
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUV400
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
@@ -394,14 +423,15 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v23.8b, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -416,13 +446,15 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV12
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -443,13 +475,15 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV21
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -470,12 +504,14 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV12
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -496,12 +532,14 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV21
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -521,13 +559,13 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB(
- v22, v21,
- v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
+ YUVTORGB_SETUP "1: \n" READNV12
+ "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
+ v22, v21, v20) ARGBTORGB565
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -546,13 +584,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUY2
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -571,13 +610,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READUYVY
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -597,11 +637,12 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -618,11 +659,13 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -640,12 +683,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -664,12 +708,16 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -684,10 +732,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
@@ -699,11 +748,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -712,89 +761,157 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
: "cc", "memory", "v0");
}
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
"1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
"1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
"1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
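
Note: the mirror rows above drop the old rev64-based code in favour of tbl lookups through the kShuffleMirror, kShuffleMirrorUV and kShuffleMirrorARGB tables, walking the source backwards and reversing 32 bytes per iteration; the tables differ only in the element size they keep intact (single bytes, UV byte pairs, 4-byte ARGB pixels). A scalar sketch of the byte-level case handled by MirrorRow_NEON follows (a sketch only, not the C reference path).

#include <stdint.h>

// Scalar sketch of MirrorRow: the destination row is the source row reversed
// byte for byte. MirrorUVRow and ARGBMirrorRow do the same with 2-byte and
// 4-byte elements respectively.
static void MirrorRow_C_sketch(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  src += width - 1;    // start at the last byte of the row
  for (x = 0; x < width; ++x) {
    dst[x] = src[-x];  // copy in reverse order
  }
}
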
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v4.8b, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -805,14 +922,15 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v5.8b, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -823,14 +941,15 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
- "movi v0.8b, #255 \n" // Alpha
+ "movi v0.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v2.8b, v4.8b, v4.8b \n" // move g
- "orr v1.8b, v5.8b, v5.8b \n" // move r
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
- "b.gt 1b \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "orr v1.8b, v5.8b, v5.8b \n" // move r
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
@@ -842,12 +961,13 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -873,13 +993,14 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -929,14 +1050,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -945,6 +1066,8 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
);
}
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
@@ -962,12 +1085,12 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -981,11 +1104,12 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -997,12 +1121,13 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
@@ -1014,10 +1139,11 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1029,10 +1155,11 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1047,11 +1174,12 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1067,11 +1195,12 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1089,14 +1218,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
@@ -1116,14 +1246,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
@@ -1141,13 +1272,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1163,13 +1295,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1186,13 +1319,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1207,11 +1341,13 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1224,15 +1360,17 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "dup v1.4s, %w2 \n" // dither4
+ "dup v1.4s, %w2 \n" // dither4
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1245,12 +1383,13 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB1555.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1262,15 +1401,16 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
+ "movi v4.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB4444.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1280,20 +1420,21 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1306,11 +1447,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
@@ -1321,18 +1462,19 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #29 \n" // B * 0.1140 coefficient
- "movi v5.8b, #150 \n" // G * 0.5870 coefficient
- "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1342,18 +1484,19 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #29 \n" // B * 0.1140 coefficient
- "movi v5.8b, #150 \n" // G * 0.5870 coefficient
- "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v0.8h, v1.8b, v4.8b \n" // B
- "umlal v0.8h, v2.8b, v5.8b \n" // G
- "umlal v0.8h, v3.8b, v6.8b \n" // R
- "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
- "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v1.8b, v4.8b \n" // B
+ "umlal v0.8h, v2.8b, v5.8b \n" // G
+ "umlal v0.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1367,33 +1510,33 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
+ "movi v24.8b, #112 \n" // UB / VR 0.875
// coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- // pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1437,26 +1580,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1468,7 +1613,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
);
}
-// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@@ -1476,31 +1620,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+      "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1520,25 +1666,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+      "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
@@ -1558,25 +1706,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+      "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
@@ -1596,25 +1746,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+      "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
@@ -1634,25 +1786,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+      "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
@@ -1672,25 +1826,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+ "1: \n"
+      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+      "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+      "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
@@ -1702,7 +1858,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -1710,67 +1866,54 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int width) {
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile(
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
- // 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
+ RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
}
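
With this change RGB565ToUVRow_NEON shares RGBTOUV_SETUP_REG and the RGBTOUV macro with the other UV kernels; the only format-specific work left is the RGB565TOARGB expansion. A scalar sketch of that 5:6:5 to 8:8:8 expansion by bit replication, assuming the usual little-endian RGB565 packing (the macro itself is defined elsewhere in this file):

#include <stdint.h>

// Expand one RGB565 pixel to 8-bit B, G, R by replicating the top bits of
// each field into the low bits.
static void rgb565_to_bgr888(uint16_t pix, uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = (uint8_t)(pix & 0x1f);
  uint8_t g6 = (uint8_t)((pix >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)((pix >> 11) & 0x1f);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}
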
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
@@ -1783,50 +1926,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
@@ -1846,52 +1982,45 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int width) {
const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile(
- RGBTOUV_SETUP_REG
+ RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
@@ -1907,21 +2036,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "movi v24.8b, #25 \n" // B * 0.1016 coefficient
- "movi v25.8b, #129 \n" // G * 0.5078 coefficient
- "movi v26.8b, #66 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1934,21 +2064,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1960,21 +2091,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v24.8b, #25 \n" // B * 0.1016 coefficient
- "movi v25.8b, #129 \n" // G * 0.5078 coefficient
- "movi v26.8b, #66 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1984,20 +2116,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #66 \n" // R * 0.2578 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #25 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2007,20 +2140,21 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movi v6.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v4.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2030,20 +2164,21 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2053,20 +2188,21 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2076,20 +2212,21 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "movi v6.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v4.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+      "umull       v16.8h, v0.8b, v4.8b          \n"  // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+      "umlal       v16.8h, v2.8b, v6.8b          \n"  // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2097,6 +2234,50 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile(
+ "movi v6.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v4.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+      "umull       v0.8h, v0.8b, v4.8b           \n"  // R
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+      "umlal       v0.8h, v2.8b, v6.8b           \n"  // B
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
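
The two new YJ kernels above differ only in how the weights are bound to the three loaded planes: RGB24 stores bytes as B,G,R while RAW stores them as R,G,B, so RAWToYJRow_NEON simply swaps the 29 and 77 multipliers. A scalar sketch, with hypothetical wrapper names, that makes the relationship explicit:

#include <stdint.h>

// Shared full-range luma (same formula as rgb_to_yj earlier in this patch).
static uint8_t yj(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29u * b + 150u * g + 77u * r + 128u) >> 8);
}

// RGB24: bytes are B, G, R.   RAW: bytes are R, G, B.
static uint8_t rgb24_pixel_to_yj(const uint8_t* p) { return yj(p[0], p[1], p[2]); }
static uint8_t raw_pixel_to_yj(const uint8_t* p)   { return yj(p[2], p[1], p[0]); }
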
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2107,44 +2288,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2163,56 +2349,60 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- // pixels
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- // pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
@@ -2232,17 +2422,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2258,32 +2448,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2300,28 +2491,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2334,20 +2526,21 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v24.8b, #29 \n" // B * 0.1140 coefficient
- "movi v25.8b, #150 \n" // G * 0.5870 coefficient
- "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2362,32 +2555,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
@@ -2403,51 +2597,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
"1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2465,19 +2660,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2494,15 +2691,17 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2519,15 +2718,17 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2546,17 +2747,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2573,12 +2776,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2597,15 +2802,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2625,23 +2832,26 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2663,23 +2873,25 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2697,16 +2909,17 @@ void HalfFloat1Row_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2720,18 +2933,19 @@ void HalfFloatRow_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2745,17 +2959,18 @@ void ByteToFloatRow_NEON(const uint8_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2769,20 +2984,21 @@ float ScaleMaxSamples_NEON(const float* src,
int width) {
float fmax;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+      "fmaxv       %s3, v5.4s                   \n"  // signed max accumulator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2798,21 +3014,22 @@ float ScaleSumSamples_NEON(const float* src,
int width) {
float fsum;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2825,12 +3042,13 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2847,26 +3065,31 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
"1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2884,27 +3107,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
"1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2915,6 +3139,87 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+ // rows
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
+ "fadd v2.4s, v2.4s, v4.4s \n"
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4
+ "r"(-4LL), // %5
+ "r"(20LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
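Both Gauss passes apply the separable 5-tap kernel {1, 4, 6, 4, 1}. One pass sums to 16, so the column pass followed by the row pass scales by 256; that is why the integer GaussRow narrows with a rounding shift of 8 and the float variants multiply by the 1/256 entry of kGaussCoefficients. A scalar sketch of the two float passes (illustrative only, not libyuv's C fallback):

// Illustrative scalar reference for GaussCol_F32 / GaussRow_F32 above.
static void GaussCol_F32_Sketch(const float* s0, const float* s1, const float* s2,
                                const float* s3, const float* s4, float* dst,
                                int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = s0[i] + 4.0f * s1[i] + 6.0f * s2[i] + 4.0f * s3[i] + s4[i];
  }
}

static void GaussRow_F32_Sketch(const float* src, float* dst, int width) {
  // src must provide 4 extra samples of right-hand padding.
  for (int i = 0; i < width; ++i) {
    dst[i] = (src[i] + 4.0f * src[i + 1] + 6.0f * src[i + 2] +
              4.0f * src[i + 3] + src[i + 4]) * (1.0f / 256.0f);
  }
}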
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
@@ -2922,13 +3227,15 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
- "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
- "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
- "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
- "subs %w3, %w3, #16 \n" // 16 pixels per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
- "b.gt 1b \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
@@ -2945,17 +3252,19 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v2.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_uv), // %2
@@ -2972,18 +3281,19 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v1.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_vu), // %2
@@ -2996,11 +3306,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3008,22 +3318,67 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3");
}
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
- "orr v2.16b, v0.16b, v0.16b \n" // move U after V
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
: "cc", "memory", "v0", "v1", "v2");
}
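The rewritten SwapUVRow_NEON above replaces the ld2/st2 de-interleave with a single tbl byte shuffle per 16 bytes; kShuffleSwapUV simply swaps each byte pair. A scalar sketch of the same effect (illustrative only):

// out[2*i] = in[2*i + 1], out[2*i + 1] = in[2*i]: UVUV... becomes VUVU...
static void SwapUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; ++i) {
    dst_vu[2 * i] = src_uv[2 * i + 1];
    dst_vu[2 * i + 1] = src_uv[2 * i];
  }
}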
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
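HalfMergeUVRow_NEON consumes two rows each of planar U and V, 2x2 box-averages them (uaddlp/uadalp followed by a rounding shift of 2), and writes the result as interleaved UV at half width. A scalar sketch (illustrative; not necessarily identical to libyuv's C fallback):

// width counts source pixels; assumes an even width for brevity.
static void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
                                  const uint8_t* src_v, int src_stride_v,
                                  uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[src_stride_u + x] +
                 src_u[src_stride_u + x + 1] + 2) >> 2;
    dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[src_stride_v + x] +
                 src_v[src_stride_v + x + 1] + 2) >> 2;
    dst_uv += 2;
  }
}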
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/TMessagesProj/jni/third_party/libyuv/source/row_win.cc b/TMessagesProj/jni/third_party/libyuv/source/row_win.cc
index f976d4026..9afcf060a 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/row_win.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/row_win.cc
@@ -2898,10 +2898,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
}
#endif // HAS_I422TOARGBROW_SSSE3
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2949,6 +2951,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -3045,15 +3048,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
@@ -3078,7 +3081,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale.cc b/TMessagesProj/jni/third_party/libyuv/source/scale.cc
index 5034c5032..cf3c03325 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale.cc
@@ -17,6 +17,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
#ifdef __cplusplus
namespace libyuv {
@@ -103,21 +104,6 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleRowDown2 =
- filtering == kFilterNone
- ? ScaleRowDown2_Any_MSA
- : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
- : ScaleRowDown2Box_Any_MSA);
- if (IS_ALIGNED(dst_width, 32)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
- : (filtering == kFilterLinear
- ? ScaleRowDown2Linear_MSA
- : ScaleRowDown2Box_MSA);
- }
- }
-#endif
#if defined(HAS_SCALEROWDOWN2_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleRowDown2 =
@@ -133,6 +119,21 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+ : ScaleRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_MSA
+ : ScaleRowDown2Box_MSA);
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -255,15 +256,6 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleRowDown4 =
- filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
- if (IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEROWDOWN4_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleRowDown4 =
@@ -273,6 +265,15 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN4_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -378,6 +379,18 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_MMI;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_MMI;
+ if (dst_width % 24 == 0) {
+ ScaleRowDown34_0 = ScaleRowDown34_MMI;
+ ScaleRowDown34_1 = ScaleRowDown34_MMI;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
@@ -398,18 +411,6 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN34_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_Any_MMI;
- ScaleRowDown34_1 = ScaleRowDown34_Any_MMI;
- if (dst_width % 24 == 0) {
- ScaleRowDown34_0 = ScaleRowDown34_MMI;
- ScaleRowDown34_1 = ScaleRowDown34_MMI;
- }
- }
- }
-#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
@@ -890,14 +891,6 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
-#if defined(HAS_SCALEADDROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleAddRow = ScaleAddRow_Any_MSA;
- if (IS_ALIGNED(src_width, 16)) {
- ScaleAddRow = ScaleAddRow_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEADDROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleAddRow = ScaleAddRow_Any_MMI;
@@ -906,6 +899,14 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
+#if defined(HAS_SCALEADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleAddRow = ScaleAddRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_MSA;
+ }
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
@@ -1042,14 +1043,6 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(src_width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -1058,6 +1051,14 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1670,7 +1671,7 @@ void ScalePlane_16(const uint16_t* src,
}
if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, 0, 0, dy, 1, filtering);
return;
@@ -1869,6 +1870,40 @@ int I444Scale_16(const uint16_t* src_y,
return 0;
}
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_uv || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+ dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
+
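A hedged usage sketch for the new NV12Scale entry point, based only on the signature above (the declaration is assumed to live in libyuv/scale.h alongside the other plane scalers; sizes and tight strides below are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/scale.h"  // assumed location of the NV12Scale declaration

// Downscale a 1280x720 NV12 frame to 640x360 (illustrative values).
int ScaleFrameExample(void) {
  enum { kSrcW = 1280, kSrcH = 720, kDstW = 640, kDstH = 360 };
  uint8_t* src_y = (uint8_t*)malloc(kSrcW * kSrcH);
  uint8_t* src_uv = (uint8_t*)malloc(kSrcW * (kSrcH / 2));  // interleaved UV, half height
  uint8_t* dst_y = (uint8_t*)malloc(kDstW * kDstH);
  uint8_t* dst_uv = (uint8_t*)malloc(kDstW * (kDstH / 2));
  // For NV12 with tight packing, the UV stride equals the luma width in bytes.
  int r = NV12Scale(src_y, kSrcW, src_uv, kSrcW, kSrcW, kSrcH,
                    dst_y, kDstW, dst_uv, kDstW, kDstW, kDstH,
                    kFilterBilinear);
  free(src_y); free(src_uv); free(dst_y); free(dst_uv);
  return r;  // 0 on success, -1 on invalid arguments (see the checks above)
}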
// Deprecated api
LIBYUV_API
int Scale(const uint8_t* src_y,
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_any.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_any.cc
index d780cb1ff..c93d70c5f 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale_any.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_any.cc
@@ -20,49 +20,6 @@ namespace libyuv {
extern "C" {
#endif
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
- int dx) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
- }
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MMI
-CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
- ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
- ScaleARGBFilterCols_MSA,
- ScaleARGBFilterCols_C,
- 4,
- 7)
-#endif
-#undef CANY
-
// Fixed scale down.
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -113,6 +70,22 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+ ScaleUVRowDown2Box_SSSE3,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 4)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+ ScaleUVRowDown2Box_AVX2,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 8)
+#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -155,6 +128,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+ ScaleUVRowDown2Box_NEON,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 8)
+#endif
+
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -508,6 +490,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
4,
1)
#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
#ifdef SASIMDONLY
// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
@@ -577,6 +566,49 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
#endif // SASIMDONLY
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_argb.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_argb.cc
index 58aa5ebbe..451d4ec4d 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale_argb.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_argb.cc
@@ -95,22 +95,6 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWN2_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleARGBRowDown2 =
- filtering == kFilterNone
- ? ScaleARGBRowDown2_Any_MSA
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
- : ScaleARGBRowDown2Box_Any_MSA);
- if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBRowDown2 =
- filtering == kFilterNone
- ? ScaleARGBRowDown2_MSA
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
- : ScaleARGBRowDown2Box_MSA);
- }
- }
-#endif
#if defined(HAS_SCALEARGBROWDOWN2_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleARGBRowDown2 =
@@ -127,6 +111,22 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+ : ScaleARGBRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+ : ScaleARGBRowDown2Box_MSA);
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -243,16 +243,6 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
- : ScaleARGBRowDownEven_Any_MSA;
- if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBRowDownEven =
- filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
@@ -263,6 +253,16 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+ : ScaleARGBRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -436,14 +436,6 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(dst_width, 8)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -451,6 +443,14 @@ static void ScaleARGBBilinearUp(int src_width,
InterpolateRow = InterpolateRow_MMI;
}
}
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
@@ -490,14 +490,6 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MSA)
- if (!filtering && TestCpuFlag(kCpuHasMSA)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
- if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBCols_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEARGBCOLS_MMI)
if (!filtering && TestCpuFlag(kCpuHasMMI)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
@@ -505,6 +497,14 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBCols_MMI;
}
}
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MSA;
+ }
+ }
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -619,14 +619,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(src_width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
#if defined(HAS_I422TOARGBROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
I422ToARGBRow = I422ToARGBRow_Any_MMI;
@@ -635,6 +627,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
@@ -713,14 +713,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MSA)
- if (!filtering && TestCpuFlag(kCpuHasMSA)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
- if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBCols_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEARGBCOLS_MMI)
if (!filtering && TestCpuFlag(kCpuHasMMI)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
@@ -728,6 +720,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBCols_MMI;
}
}
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MSA;
+ }
+ }
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -857,14 +857,6 @@ static void ScaleARGBSimple(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ScaleARGBCols = ScaleARGBCols_Any_MSA;
- if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBCols = ScaleARGBCols_MSA;
- }
- }
-#endif
#if defined(HAS_SCALEARGBCOLS_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ScaleARGBCols = ScaleARGBCols_Any_MMI;
@@ -872,6 +864,14 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBCols_MMI;
}
}
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBCols = ScaleARGBCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBCols_MSA;
+ }
+ }
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
@@ -981,7 +981,7 @@ static void ScaleARGB(const uint8_t* src,
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_common.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_common.cc
index 636902717..fd4cbd038 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale_common.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_common.cc
@@ -776,6 +776,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr,
}
}
+// ARGB scale row functions
+
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -1018,6 +1020,235 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
#undef BLENDERC
#undef BLENDER
+// UV scale row functions
+// same as ARGB but 2 channels
+
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += src_stepx * 2;
+ dst_uv += 2;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
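The x/dx values used by these column scalers are 16.16 fixed point; BLENDER takes the top 7 fraction bits ((x >> 9) & 0x7f) and blends each channel as (a * (127 - f) + b * f) >> 7. A worked example (illustrative):

// x = 0x00018000 -> source index 1, fraction 0.5 -> f = (x >> 9) & 0x7f = 64.
// Blending channel values a = 100, b = 200:
//   (100 * (127 - 64) + 200 * 64) >> 7 = (6300 + 12800) >> 7 = 19100 >> 7 = 149
// Slightly below the exact midpoint 150 because 0x7f ^ f is 127 - f, not 128 - f
// (see the TODO above).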
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -1067,14 +1298,6 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(dst_width_bytes, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
@@ -1082,6 +1305,14 @@ void ScalePlaneVertical(int src_height,
InterpolateRow = InterpolateRow_MMI;
}
}
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
#endif
for (j = 0; j < dst_height; ++j) {
int yi;
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_gcc.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_gcc.cc
index 90a49f30d..e575ee18b 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale_gcc.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_gcc.cc
@@ -102,16 +102,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -125,25 +125,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -156,33 +156,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -200,17 +200,17 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -225,26 +225,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -258,34 +258,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -301,24 +301,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -332,46 +332,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -387,26 +387,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -420,46 +420,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -476,9 +476,9 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
@@ -488,20 +488,20 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -514,18 +514,18 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
@@ -535,37 +535,37 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -580,18 +580,18 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
@@ -602,40 +602,40 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -651,23 +651,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -681,10 +681,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
@@ -695,25 +695,25 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -726,10 +726,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
@@ -739,44 +739,44 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -791,25 +791,25 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -824,22 +824,22 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -866,69 +866,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
// 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -966,16 +966,16 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -993,14 +993,14 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1017,17 +1017,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1043,21 +1043,21 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1076,23 +1076,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
"1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1113,32 +1113,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1156,56 +1156,56 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
LABELALIGN
"40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
"29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
@@ -1230,16 +1230,16 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1267,63 +1267,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
- LABELALIGN "99: \n" // clang-format error.
+ LABELALIGN
+ "99: \n" // clang-format error.
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1339,10 +1340,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
@@ -1353,19 +1354,108 @@ int FixedDiv_X86(int num, int div) {
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
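
For readers less fluent in the inline assembly: both helpers above produce a 16.16 fixed-point ratio. A plain-C sketch of the same arithmetic is below; the names are illustrative and div is assumed nonzero (and greater than one for the second variant).

#include <stdint.h>

// Illustrative C equivalent of FixedDiv_X86: (num << 16) / div as 16.16.
static int fixed_div_sketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}

// Illustrative C equivalent of FixedDiv1_X86: the sub/sbb pair subtracts
// 0x10001 from the shifted 64-bit numerator before dividing by (div - 1).
static int fixed_div1_sketch(int num, int div) {
  return (int)(((((int64_t)num) << 16) - 0x10001) / (div - 1));
}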
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80};
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
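
The new UV box kernel averages 2x2 blocks of interleaved UV pairs: the split shuffler groups the U and V bytes so pmaddubsw can add horizontal neighbours, the two rows are summed, and the merge shuffler re-interleaves the rounded result. A scalar sketch of the per-pixel arithmetic (function name is illustrative):

#include <stdint.h>

// Illustrative scalar equivalent of ScaleUVRowDown2Box: each output UV pair is
// the rounded average of a 2x2 block of interleaved UV samples.
static void scale_uv_down2_box_sketch(const uint8_t* row0, const uint8_t* row1,
                                      uint8_t* dst_uv, int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    const uint8_t* s0 = row0 + 4 * i;  // two UV pairs from row 0
    const uint8_t* s1 = row1 + 4 * i;  // two UV pairs from row 1
    dst_uv[2 * i + 0] = (uint8_t)((s0[0] + s0[2] + s1[0] + s1[2] + 2) >> 2);  // U
    dst_uv[2 * i + 1] = (uint8_t)((s0[1] + s0[3] + s1[1] + s1[3] + 2) >> 2);  // V
  }
}

The AVX2 variant that follows does the same computation on twice as many UV pairs per iteration; the extra vpermq undoes the 128-bit-lane split introduced by vpshufb before the narrow store.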
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_neon.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_neon.cc
index 366b155ba..572b4bfa9 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale_neon.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_neon.cc
@@ -31,10 +31,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -51,11 +51,11 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -71,21 +71,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %1, %0 \n"
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
// row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vld1.8 {q3}, [%3] \n"
+ "vld1.8 {q3}, [%3] \n"
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
    // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
    // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
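
The trick mentioned here is the usual multiply-by-reciprocal: for a divisor n that is not a power of two, x / n is approximated as (x * (65536 / n)) >> 16. A small sketch for the n = 6 case is below; the actual constants live in a table elsewhere in the file, and vqrdmulh additionally doubles the product and rounds, so the stored constant is scaled accordingly (that detail is assumed, not shown in this hunk).

#include <stdint.h>

// Illustrative reciprocal division: a sum of six 8-bit samples divided by 6
// without an integer divide, as the comment above describes. Approximate.
static uint8_t div6_by_reciprocal_sketch(uint16_t sum /* 0..1530 */) {
  const uint32_t kRecip6 = (65536 + 5) / 6;         // reciprocal rounded up (10923)
  return (uint8_t)((sum * kRecip6 + 32768) >> 16);  // rounded, approximately sum / 6
}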
@@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
    // Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -511,13 +511,13 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width) {
asm volatile(
"1: \n"
- "vld1.16 {q1, q2}, [%1] \n" // load accumulator
- "vld1.8 {q0}, [%0]! \n" // load 16 bytes
- "vaddw.u8 q2, q2, d1 \n" // add
- "vaddw.u8 q1, q1, d0 \n"
- "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bgt 1b \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -547,17 +547,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_ptr;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -566,27 +566,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -609,75 +609,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
+ "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -694,12 +694,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // load next 8 ARGB
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -722,13 +722,13 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -743,27 +743,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -781,15 +781,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
- "mov r12, %3, lsl #2 \n"
+ "mov r12, %3, lsl #2 \n"
"1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -805,30 +805,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -865,8 +865,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -897,16 +897,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_argb;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
@@ -950,6 +950,64 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n"
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_neon64.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_neon64.cc
index 0a7b80ce1..185591cb5 100644
--- a/TMessagesProj/jni/third_party/libyuv/source/scale_neon64.cc
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_neon64.cc
@@ -29,10 +29,11 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -50,11 +51,12 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -70,19 +72,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -99,10 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -119,19 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_ptr1), // %2
@@ -151,12 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -169,49 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -226,33 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -279,14 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -303,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -372,35 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
+ "add v0.8h, v0.8h, v16.8h \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -422,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -478,33 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -522,13 +542,14 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width) {
asm volatile(
"1: \n"
- "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
- "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
- "uaddw v1.8h, v1.8h, v0.8b \n"
- "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -560,17 +581,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
+ "add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -579,27 +600,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -623,74 +644,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
- "st1 {v0.b}[15], [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -709,11 +739,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -730,13 +761,14 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -751,25 +783,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -788,13 +822,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -812,33 +847,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -875,10 +912,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -911,16 +949,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
@@ -941,15 +979,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
-
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -972,19 +1010,21 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1001,38 +1041,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
uint16_t* dst,
int dst_width) {
asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
"1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1044,6 +1086,64 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
);
}
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/TMessagesProj/jni/third_party/libyuv/source/scale_uv.cc b/TMessagesProj/jni/third_party/libyuv/source/scale_uv.cc
new file mode 100644
index 000000000..b0469f09b
--- /dev/null
+++ b/TMessagesProj/jni/third_party/libyuv/source/scale_uv.cc
@@ -0,0 +1,891 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV plane to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI
+ : ScaleUVRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI
+ : ScaleUVRowDown2Box_MMI);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV plane to 1/4 of
+// its original size.
+#if HAS_SCALEUVDOWN4BOX
+static void ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 * 2 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);
+ ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+#endif // HAS_SCALEUVDOWN4BOX
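In other words, the 1/4 path reuses the 1/2 box kernel three times per output row: source rows r and r+1 are box-filtered into one half-scale intermediate row, rows r+2 and r+3 into a second, and the two intermediates are reduced once more into the destination row. Each scratch row holds dst_width * 2 UV pairs, i.e. dst_width * 4 bytes, rounded up to a 16-byte multiple: for dst_width = 100 that is 400 bytes (already aligned), while dst_width = 101 gives (404 + 15) & ~15 = 416.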
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV plane by an even
+// factor of its original size.
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+// TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif
+
+// Scale UV down with bilinear interpolation.
+#if HAS_SCALEUVBILINEARDOWN
+static void ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2.
+ src_uv += xl * 2;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ align_buffer_64(row, clip_src_width * 2);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif
+
+// Scale UV up with bilinear interpolation.
+#if HAS_SCALEUVBILINEARUP
+static void ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVFilterCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * src_stride;
+
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARUP
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
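A small self-contained illustration of the 16.16 stepping described in the comment above ScaleUVSimple. The values are illustrative, and the real starting x comes from ScaleSlope(), which may add a centering offset.

    #include <stdio.h>

    int main(void) {
      int src_width = 640, dst_width = 256;
      int dx = (src_width << 16) / dst_width;  // 2.5 in 16.16 fixed point == 0x28000
      int x = 0;                               // assumed start; ScaleSlope() may differ
      for (int i = 0; i < dst_width; ++i) {
        int src_col = x >> 16;                 // integer part picks the source UV pair
        if (i < 5) printf("dst %d <- src %d (frac 0x%04x)\n", i, src_col, x & 0xffff);
        x += dx;
      }
      return 0;
    }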
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_UV,
+ int src_stride_UV,
+ uint8_t* dst_UV,
+ int dst_stride_UV,
+ int width,
+ int height) {
+ if (!src_UV || !dst_UV || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_UV = src_UV + (height - 1) * src_stride_UV;
+ src_stride_UV = -src_stride_UV;
+ }
+
+ CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
+ return 0;
+}
+#endif // HAS_UVCOPY
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleUV(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64_t clipf = (int64_t)(clip_x)*dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 2;
+ dst += clip_x * 2;
+ }
+ if (clip_y) {
+ int64_t clipf = (int64_t)(clip_y)*dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+ ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+#endif
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst,
+ dst_stride, clip_width, clip_height);
+ return;
+ }
+#endif
+ }
+ }
+ }
+ // HAS_SCALEPLANEVERTICAL
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, 4, filtering);
+ return;
+ }
+
+#if HAS_SCALEUVBILINEARUP
+ if (filtering && dy < 65536) {
+ ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+ if (filtering) {
+ ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+ ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+}
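For orientation, the 16.16 step constants tested above, written out (illustrative, assuming dy behaves like dx):

- dx == 0x10000 (1.0): straight copy via UVCopy
- dx == 0x20000 (2.0): ScaleUVDown2
- dx == 0x40000 (4.0) with kFilterBox: ScaleUVDown4Box
- other even integer steps (0x60000, 0x80000, ...): ScaleUVDownEven
- odd integer steps (0x30000, 0x50000, ...): filtering is forced to kFilterNone and the frame falls through to ScaleUVSimple point sampling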
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+ dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
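A hedged usage sketch of the new entry point, scaling the interleaved chroma plane of an NV12 frame. The header name and the wrapper function are assumptions; chroma dimensions follow the usual (dim + 1) / 2 convention.

    #include <stdint.h>
    #include "libyuv/scale_uv.h"  // assumed header that declares UVScale

    // Scale the UV plane of a 1280x720 NV12 frame down to match a 640x360 output.
    int ScaleNv12Chroma(const uint8_t* src_uv, int src_stride_uv,
                        uint8_t* dst_uv, int dst_stride_uv) {
      const int src_cw = (1280 + 1) / 2, src_ch = (720 + 1) / 2;  // 640 x 360 UV pairs
      const int dst_cw = (640 + 1) / 2, dst_ch = (360 + 1) / 2;   // 320 x 180 UV pairs
      return libyuv::UVScale(src_uv, src_stride_uv, src_cw, src_ch,
                             dst_uv, dst_stride_uv, dst_cw, dst_ch,
                             libyuv::kFilterBilinear);  // 0 on success, -1 on bad args
    }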
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/TMessagesProj/jni/voip/CMakeLists.txt b/TMessagesProj/jni/voip/CMakeLists.txt
index bd97ae2ac..3873854aa 100644
--- a/TMessagesProj/jni/voip/CMakeLists.txt
+++ b/TMessagesProj/jni/voip/CMakeLists.txt
@@ -221,6 +221,8 @@ add_library(tgcalls_tp STATIC
third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.c
+ third_party/libvpx/source/libvpx/vp9/encoder/vp9_ext_ratectrl.c
+ third_party/libvpx/source/libvpx/vp9/ratectrl_rtc.cc
third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
third_party/libvpx/source/libvpx/vp9/vp9_iface_common.c
@@ -738,6 +740,7 @@ add_library(tgcalls STATIC
voip/tgcalls/JsonConfig.cpp
voip/tgcalls/reference/InstanceImplReference.cpp
voip/tgcalls/legacy/InstanceImplLegacy.cpp
+ voip/tgcalls/group/GroupInstanceImpl.cpp
voip/webrtc/rtc_base/async_invoker.cc
voip/webrtc/rtc_base/async_packet_socket.cc
@@ -801,7 +804,6 @@ add_library(tgcalls STATIC
voip/webrtc/rtc_base/numerics/moving_average.cc
voip/webrtc/rtc_base/numerics/sample_counter.cc
voip/webrtc/rtc_base/numerics/sample_stats.cc
- voip/webrtc/rtc_base/numerics/samples_stats_counter.cc
voip/webrtc/rtc_base/openssl_adapter.cc
voip/webrtc/rtc_base/openssl_certificate.cc
voip/webrtc/rtc_base/openssl_digest.cc
@@ -837,8 +839,6 @@ add_library(tgcalls STATIC
voip/webrtc/rtc_base/strings/audio_format_to_string.cc
voip/webrtc/rtc_base/strings/string_builder.cc
voip/webrtc/rtc_base/strings/string_format.cc
- voip/webrtc/rtc_base/synchronization/rw_lock_posix.cc
- voip/webrtc/rtc_base/synchronization/rw_lock_wrapper.cc
voip/webrtc/rtc_base/synchronization/mutex.cc
voip/webrtc/rtc_base/synchronization/yield.cc
voip/webrtc/rtc_base/synchronization/sequence_checker.cc
@@ -860,6 +860,7 @@ add_library(tgcalls STATIC
voip/webrtc/rtc_base/unique_id_generator.cc
voip/webrtc/rtc_base/weak_ptr.cc
voip/webrtc/rtc_base/zero_memory.cc
+ voip/webrtc/rtc_base/callback_list.cc
voip/webrtc/rtc_base/deprecated/recursive_critical_section.cc
voip/webrtc/rtc_base/deprecated/signal_thread.cc
voip/webrtc/api/audio/audio_frame.cc
@@ -934,6 +935,7 @@ add_library(tgcalls STATIC
voip/webrtc/api/video/video_adaptation_counters.cc
voip/webrtc/api/video/video_frame_metadata.cc
voip/webrtc/api/voip/voip_engine_factory.cc
+ voip/webrtc/api/numerics/samples_stats_counter.cc
voip/webrtc/call/adaptation/adaptation_constraint.cc
voip/webrtc/call/adaptation/broadcast_resource_listener.cc
voip/webrtc/call/adaptation/degradation_preference_provider.cc
@@ -964,6 +966,7 @@ add_library(tgcalls STATIC
voip/webrtc/api/video/video_content_type.cc
voip/webrtc/api/video/video_frame.cc
voip/webrtc/api/video/video_frame_buffer.cc
+ voip/webrtc/api/video/nv12_buffer.cc
voip/webrtc/api/video/video_source_interface.cc
voip/webrtc/api/video/video_stream_decoder_create.cc
voip/webrtc/api/video/video_stream_encoder_create.cc
@@ -981,6 +984,7 @@ add_library(tgcalls STATIC
voip/webrtc/api/video_codecs/vp8_frame_config.cc
voip/webrtc/api/video_codecs/vp8_temporal_layers.cc
voip/webrtc/api/video_codecs/vp8_temporal_layers_factory.cc
+ voip/webrtc/api/video_codecs/spatial_layer.cc
voip/webrtc/pc/audio_rtp_receiver.cc
voip/webrtc/pc/audio_track.cc
voip/webrtc/pc/channel.cc
@@ -1037,6 +1041,12 @@ add_library(tgcalls STATIC
voip/webrtc/pc/video_track_source.cc
voip/webrtc/pc/webrtc_sdp.cc
voip/webrtc/pc/webrtc_session_description_factory.cc
+ voip/webrtc/pc/connection_context.cc
+ voip/webrtc/pc/peer_connection_message_handler.cc
+ voip/webrtc/pc/rtp_transmission_manager.cc
+ voip/webrtc/pc/sdp_offer_answer.cc
+ voip/webrtc/pc/transceiver_list.cc
+ voip/webrtc/pc/usage_pattern.cc
voip/webrtc/media/base/adapted_video_track_source.cc
voip/webrtc/media/base/codec.cc
voip/webrtc/media/base/h264_profile_level_id.cc
@@ -1176,9 +1186,6 @@ add_library(tgcalls STATIC
voip/webrtc/modules/audio_coding/codecs/opus/audio_encoder_multi_channel_opus_impl.cc
voip/webrtc/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
voip/webrtc/modules/audio_coding/codecs/opus/opus_interface.cc
- voip/webrtc/modules/audio_coding/codecs/opus/test/audio_ring_buffer.cc
- voip/webrtc/modules/audio_coding/codecs/opus/test/blocker.cc
- voip/webrtc/modules/audio_coding/codecs/opus/test/lapped_transform.cc
voip/webrtc/modules/audio_coding/codecs/pcm16b/audio_decoder_pcm16b.cc
voip/webrtc/modules/audio_coding/codecs/pcm16b/audio_encoder_pcm16b.cc
voip/webrtc/modules/audio_coding/codecs/pcm16b/pcm16b_common.cc
@@ -1353,6 +1360,7 @@ add_library(tgcalls STATIC
voip/webrtc/modules/audio_processing/aec3/subtractor_output_analyzer.cc
voip/webrtc/modules/audio_processing/aec3/suppression_filter.cc
voip/webrtc/modules/audio_processing/aec3/suppression_gain.cc
+ voip/webrtc/modules/audio_processing/aec3/transparent_mode.cc
voip/webrtc/modules/audio_processing/aec_dump/null_aec_dump_factory.cc
voip/webrtc/modules/audio_processing/aecm/aecm_core.cc
voip/webrtc/modules/audio_processing/aecm/aecm_core_c.cc
@@ -1367,7 +1375,6 @@ add_library(tgcalls STATIC
voip/webrtc/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc
voip/webrtc/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
voip/webrtc/modules/audio_processing/agc2/adaptive_mode_level_estimator_agc.cc
- voip/webrtc/modules/audio_processing/agc2/agc2_common.cc
voip/webrtc/modules/audio_processing/agc2/agc2_testing_common.cc
voip/webrtc/modules/audio_processing/agc2/biquad_filter.cc
voip/webrtc/modules/audio_processing/agc2/compute_interpolated_gain_curve.cc
@@ -1472,7 +1479,6 @@ add_library(tgcalls STATIC
voip/webrtc/modules/congestion_controller/goog_cc/robust_throughput_estimator.cc
voip/webrtc/modules/congestion_controller/goog_cc/send_side_bandwidth_estimation.cc
voip/webrtc/modules/congestion_controller/goog_cc/trendline_estimator.cc
- voip/webrtc/modules/include/module_common_types.cc
voip/webrtc/modules/pacing/bitrate_prober.cc
voip/webrtc/modules/pacing/interval_budget.cc
voip/webrtc/modules/pacing/paced_sender.cc
@@ -1575,6 +1581,7 @@ add_library(tgcalls STATIC
voip/webrtc/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
voip/webrtc/modules/rtp_rtcp/source/receive_statistics_impl.cc
voip/webrtc/modules/rtp_rtcp/source/deprecated/deprecated_rtp_sender_egress.cc
+ voip/webrtc/modules/rtp_rtcp/source/rtp_video_layers_allocation_extension.cc
voip/webrtc/modules/utility/source/helpers_android.cc
voip/webrtc/modules/utility/source/jvm_android.cc
voip/webrtc/modules/utility/source/process_thread_impl.cc
@@ -1646,10 +1653,23 @@ add_library(tgcalls STATIC
voip/webrtc/modules/video_coding/codecs/vp8/screenshare_layers.cc
voip/webrtc/modules/video_coding/codecs/vp8/temporal_layers_checker.cc
voip/webrtc/modules/video_coding/codecs/vp9/svc_config.cc
- voip/webrtc/modules/video_coding/codecs/vp9/svc_rate_allocator.cc
voip/webrtc/modules/video_coding/codecs/vp9/vp9.cc
voip/webrtc/modules/video_coding/codecs/vp9/vp9_frame_buffer_pool.cc
voip/webrtc/modules/video_coding/codecs/vp9/vp9_impl.cc
+ voip/webrtc/modules/video_coding/svc/create_scalability_structure.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_full_svc.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_key_svc.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l1t2.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l1t3.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l2t1.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l2t1h.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l2t2.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l2t2_key_shift.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l3t1.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_l3t3.cc
+ voip/webrtc/modules/video_coding/svc/scalability_structure_s2t1.cc
+ voip/webrtc/modules/video_coding/svc/scalable_video_controller_no_layering.cc
+ voip/webrtc/modules/video_coding/svc/svc_rate_allocator.cc
voip/webrtc/modules/video_processing/util/denoiser_filter.cc
voip/webrtc/modules/video_processing/util/denoiser_filter_c.cc
voip/webrtc/modules/video_processing/util/noise_estimation.cc
@@ -1751,11 +1771,11 @@ add_library(tgcalls STATIC
voip/webrtc/common_video/h265/h265_pps_parser.cc
voip/webrtc/common_video/h265/h265_sps_parser.cc
voip/webrtc/common_video/h265/h265_vps_parser.cc
- voip/webrtc/common_video/i420_buffer_pool.cc
voip/webrtc/common_video/incoming_video_stream.cc
voip/webrtc/common_video/libyuv/webrtc_libyuv.cc
voip/webrtc/common_video/video_frame_buffer.cc
voip/webrtc/common_video/video_render_frames.cc
+ voip/webrtc/common_video/video_frame_buffer_pool.cc
voip/webrtc/p2p/base/async_stun_tcp_socket.cc
voip/webrtc/p2p/base/basic_async_resolver_factory.cc
voip/webrtc/p2p/base/basic_ice_controller.cc
@@ -1792,6 +1812,7 @@ add_library(tgcalls STATIC
voip/webrtc/video/adaptation/quality_rampup_experiment_helper.cc
voip/webrtc/modules/video_coding/deprecated/nack_module.cc
voip/webrtc/modules/video_coding/nack_module2.cc
+ voip/webrtc/modules/async_audio_processing/async_audio_processing.cc
voip/webrtc/logging/rtc_event_log/encoder/blob_encoding.cc
voip/webrtc/logging/rtc_event_log/encoder/delta_encoding.cc
voip/webrtc/logging/rtc_event_log/encoder/rtc_event_log_encoder_common.cc
@@ -1830,6 +1851,8 @@ add_library(tgcalls STATIC
voip/webrtc/video/adaptation/quality_scaler_resource.cc
voip/webrtc/video/adaptation/video_stream_encoder_resource.cc
voip/webrtc/video/adaptation/video_stream_encoder_resource_manager.cc
+ voip/webrtc/video/adaptation/balanced_constraint.cc
+ voip/webrtc/video/adaptation/bitrate_constraint.cc
voip/webrtc/video/buffered_frame_decryptor.cc
voip/webrtc/video/call_stats.cc
voip/webrtc/video/encoder_bitrate_adjuster.cc
@@ -1864,6 +1887,7 @@ add_library(tgcalls STATIC
voip/webrtc/video/rtp_streams_synchronizer2.cc
voip/webrtc/video/receive_statistics_proxy2.cc
voip/webrtc/video/call_stats2.cc
+ voip/webrtc/video/alignment_adjuster.cc
voip/webrtc/audio/audio_level.cc
voip/webrtc/audio/audio_receive_stream.cc
voip/webrtc/audio/audio_send_stream.cc
diff --git a/TMessagesProj/jni/voip/org_telegram_messenger_voip_Instance.cpp b/TMessagesProj/jni/voip/org_telegram_messenger_voip_Instance.cpp
index 8a852a6b0..b1e6485fb 100644
--- a/TMessagesProj/jni/voip/org_telegram_messenger_voip_Instance.cpp
+++ b/TMessagesProj/jni/voip/org_telegram_messenger_voip_Instance.cpp
@@ -10,6 +10,9 @@
#include
#include
#include
+#include
+
+#include
#include "pc/video_track.h"
#include "legacy/InstanceImplLegacy.h"
@@ -69,12 +72,14 @@ public:
struct InstanceHolder {
std::unique_ptr nativeInstance;
+ std::unique_ptr groupNativeInstance;
jobject javaInstance;
std::shared_ptr _videoCapture;
std::shared_ptr _platformContext;
};
jclass TrafficStatsClass;
+jclass FingerprintClass;
jclass FinalStateClass;
jclass NativeInstanceClass;
jmethodID FinalStateInitMethod;
@@ -87,10 +92,6 @@ InstanceHolder *getInstanceHolder(JNIEnv *env, jobject obj) {
return reinterpret_cast<InstanceHolder *>(getInstanceHolderId(env, obj));
}
-Instance *getInstance(JNIEnv *env, jobject obj) {
- return getInstanceHolder(env, obj)->nativeInstance.get();
-}
-
jint throwNewJavaException(JNIEnv *env, const char *className, const char *message) {
return env->ThrowNew(env->FindClass(className), message);
}
@@ -220,6 +221,14 @@ jobject asJavaFinalState(JNIEnv *env, const FinalState &finalState) {
return env->NewObject(FinalStateClass, FinalStateInitMethod, persistentState, debugLog, trafficStats, isRatingSuggested);
}
+jobject asJavaFingerprint(JNIEnv *env, std::string hash, std::string setup, std::string fingerprint) {
+ jstring hashStr = env->NewStringUTF(hash.c_str());
+ jstring setupStr = env->NewStringUTF(setup.c_str());
+ jstring fingerprintStr = env->NewStringUTF(fingerprint.c_str());
+ jmethodID initMethodId = env->GetMethodID(FingerprintClass, "<init>", "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)V");
+ return env->NewObject(FingerprintClass, initMethodId, hashStr, setupStr, fingerprintStr);
+}
+
extern "C" {
bool webrtcLoaded = false;
@@ -237,10 +246,129 @@ void initWebRTC(JNIEnv *env) {
NativeInstanceClass = static_cast<jclass>(env->NewGlobalRef(env->FindClass("org/telegram/messenger/voip/NativeInstance")));
TrafficStatsClass = static_cast<jclass>(env->NewGlobalRef(env->FindClass("org/telegram/messenger/voip/Instance$TrafficStats")));
+ FingerprintClass = static_cast<jclass>(env->NewGlobalRef(env->FindClass("org/telegram/messenger/voip/Instance$Fingerprint")));
FinalStateClass = static_cast<jclass>(env->NewGlobalRef(env->FindClass("org/telegram/messenger/voip/Instance$FinalState")));
FinalStateInitMethod = env->GetMethodID(FinalStateClass, "<init>", "([BLjava/lang/String;Lorg/telegram/messenger/voip/Instance$TrafficStats;Z)V");
}
+JNIEXPORT jlong JNICALL Java_org_telegram_messenger_voip_NativeInstance_makeGroupNativeInstance(JNIEnv *env, jclass clazz, jobject instanceObj, jboolean highQuality) {
+ initWebRTC(env);
+
+ jobject globalRef = env->NewGlobalRef(instanceObj);
+ std::shared_ptr platformContext = std::make_shared(env);
+
+ GroupInstanceDescriptor descriptor = {
+ .networkStateUpdated = [globalRef](bool state) {
+ tgvoip::jni::DoWithJNI([globalRef, state](JNIEnv *env) {
+ env->CallVoidMethod(globalRef, env->GetMethodID(NativeInstanceClass, "onNetworkStateUpdated", "(Z)V"), state);
+ });
+ },
+ .audioLevelsUpdated = [globalRef](GroupLevelsUpdate const &update) {
+ tgvoip::jni::DoWithJNI([globalRef, update](JNIEnv *env) {
+ unsigned int size = update.updates.size();
+ jintArray intArray = env->NewIntArray(size);
+ jfloatArray floatArray = env->NewFloatArray(size);
+ jbooleanArray boolArray = env->NewBooleanArray(size);
+
+ jint intFill[size];
+ jfloat floatFill[size];
+ jboolean boolFill[size];
+ for (int a = 0; a < size; a++) {
+ intFill[a] = update.updates[a].ssrc;
+ floatFill[a] = update.updates[a].value.level;
+ boolFill[a] = update.updates[a].value.voice;
+ }
+ env->SetIntArrayRegion(intArray, 0, size, intFill);
+ env->SetFloatArrayRegion(floatArray, 0, size, floatFill);
+ env->SetBooleanArrayRegion(boolArray, 0, size, boolFill);
+
+ env->CallVoidMethod(globalRef, env->GetMethodID(NativeInstanceClass, "onAudioLevelsUpdated", "([I[F[Z)V"), intArray, floatArray, boolArray);
+ env->DeleteLocalRef(intArray);
+ env->DeleteLocalRef(floatArray);
+ env->DeleteLocalRef(boolArray);
+ });
+ },
+ .platformContext = platformContext
+ };
+
+ auto *holder = new InstanceHolder;
+ holder->groupNativeInstance = std::make_unique(std::move(descriptor));
+ holder->javaInstance = globalRef;
+ holder->_platformContext = platformContext;
+ holder->groupNativeInstance->emitJoinPayload([globalRef](const GroupJoinPayload& payload) {
+ JNIEnv *env = webrtc::AttachCurrentThreadIfNeeded();
+ jobjectArray array = env->NewObjectArray(payload.fingerprints.size(), FingerprintClass, 0);
+ for (int a = 0; a < payload.fingerprints.size(); a++) {
+ env->SetObjectArrayElement(array, a, asJavaFingerprint(env, payload.fingerprints[a].hash, payload.fingerprints[a].setup, payload.fingerprints[a].fingerprint));
+ }
+ env->CallVoidMethod(globalRef, env->GetMethodID(NativeInstanceClass, "onEmitJoinPayload", "(Ljava/lang/String;Ljava/lang/String;[Lorg/telegram/messenger/voip/Instance$Fingerprint;I)V"), env->NewStringUTF(payload.ufrag.c_str()), env->NewStringUTF(payload.pwd.c_str()), array, (jint) payload.ssrc);
+ });
+ return reinterpret_cast<jlong>(holder);
+}
+
+JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setJoinResponsePayload(JNIEnv *env, jobject obj, jstring ufrag, jstring pwd, jobjectArray fingerprints, jobjectArray candidates) {
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->groupNativeInstance == nullptr) {
+ return;
+ }
+ std::vector fingerprintsArray;
+ std::vector candidatesArray;
+
+ jsize size = env->GetArrayLength(fingerprints);
+ for (int i = 0; i < size; i++) {
+ JavaObject fingerprintObject(env, env->GetObjectArrayElement(fingerprints, i));
+ fingerprintsArray.push_back(
+ {
+ .hash = tgvoip::jni::JavaStringToStdString(env, fingerprintObject.getStringField("hash")),
+ .setup = tgvoip::jni::JavaStringToStdString(env, fingerprintObject.getStringField("setup")),
+ .fingerprint = tgvoip::jni::JavaStringToStdString(env, fingerprintObject.getStringField("fingerprint"))
+ });
+ }
+ size = env->GetArrayLength(candidates);
+ for (int i = 0; i < size; i++) {
+ JavaObject candidateObject(env, env->GetObjectArrayElement(candidates, i));
+ candidatesArray.push_back(
+ {
+ .port = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("port")),
+ .protocol = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("protocol")),
+ .network = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("network")),
+ .generation = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("generation")),
+ .id = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("id")),
+ .component = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("component")),
+ .foundation = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("foundation")),
+ .priority = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("priority")),
+ .ip = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("ip")),
+ .type = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("type")),
+ .tcpType = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("tcpType")),
+ .relAddr = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("relAddr")),
+ .relPort = tgvoip::jni::JavaStringToStdString(env, candidateObject.getStringField("relPort")),
+ });
+ }
+
+ instance->groupNativeInstance->setJoinResponsePayload(
+ {
+ .ufrag = tgvoip::jni::JavaStringToStdString(env, ufrag),
+ .pwd = tgvoip::jni::JavaStringToStdString(env, pwd),
+ .fingerprints = fingerprintsArray,
+ .candidates = candidatesArray,
+ });
+}
+
+JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_removeSsrcs(JNIEnv *env, jobject obj, jintArray ssrcs) {
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->groupNativeInstance == nullptr) {
+ return;
+ }
+ jsize size = env->GetArrayLength(ssrcs);
+
+ std::vector ssrcsArray;
+ ssrcsArray.resize(size);
+ for (int i = 0; i < size; i++) {
+ env->GetIntArrayRegion(ssrcs, 0, size, reinterpret_cast(ssrcsArray.data()));
+ }
+ instance->groupNativeInstance->removeSsrcs(ssrcsArray);
+}
+
JNIEXPORT jlong JNICALL Java_org_telegram_messenger_voip_NativeInstance_makeNativeInstance(JNIEnv *env, jclass clazz, jstring version, jobject instanceObj, jobject config, jstring persistentStateFilePath, jobjectArray endpoints, jobject proxyClass, jint networkType, jobject encryptionKey, jobject remoteSink, jlong videoCapturer, jfloat aspectRatio) {
initWebRTC(env);
@@ -369,43 +497,83 @@ JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setBuffer
}
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setNetworkType(JNIEnv *env, jobject obj, jint networkType) {
- getInstance(env, obj)->setNetworkType(parseNetworkType(networkType));
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
+ instance->nativeInstance->setNetworkType(parseNetworkType(networkType));
}
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setMuteMicrophone(JNIEnv *env, jobject obj, jboolean muteMicrophone) {
- getInstance(env, obj)->setMuteMicrophone(muteMicrophone);
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance != nullptr) {
+ instance->nativeInstance->setMuteMicrophone(muteMicrophone);
+ } else if (instance->groupNativeInstance != nullptr) {
+ instance->groupNativeInstance->setIsMuted(muteMicrophone);
+ }
}
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setAudioOutputGainControlEnabled(JNIEnv *env, jobject obj, jboolean enabled) {
- getInstance(env, obj)->setAudioOutputGainControlEnabled(enabled);
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
+ instance->nativeInstance->setAudioOutputGainControlEnabled(enabled);
}
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setEchoCancellationStrength(JNIEnv *env, jobject obj, jint strength) {
- getInstance(env, obj)->setEchoCancellationStrength(strength);
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
+ instance->nativeInstance->setEchoCancellationStrength(strength);
}
JNIEXPORT jstring JNICALL Java_org_telegram_messenger_voip_NativeInstance_getLastError(JNIEnv *env, jobject obj) {
- return env->NewStringUTF(getInstance(env, obj)->getLastError().c_str());
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return nullptr;
+ }
+ return env->NewStringUTF(instance->nativeInstance->getLastError().c_str());
}
JNIEXPORT jstring JNICALL Java_org_telegram_messenger_voip_NativeInstance_getDebugInfo(JNIEnv *env, jobject obj) {
- return env->NewStringUTF(getInstance(env, obj)->getDebugInfo().c_str());
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return nullptr;
+ }
+ return env->NewStringUTF(instance->nativeInstance->getDebugInfo().c_str());
}
JNIEXPORT jlong JNICALL Java_org_telegram_messenger_voip_NativeInstance_getPreferredRelayId(JNIEnv *env, jobject obj) {
- return getInstance(env, obj)->getPreferredRelayId();
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return 0;
+ }
+ return instance->nativeInstance->getPreferredRelayId();
}
JNIEXPORT jobject JNICALL Java_org_telegram_messenger_voip_NativeInstance_getTrafficStats(JNIEnv *env, jobject obj) {
- return asJavaTrafficStats(env, getInstance(env, obj)->getTrafficStats());
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return nullptr;
+ }
+ return asJavaTrafficStats(env, instance->nativeInstance->getTrafficStats());
}
JNIEXPORT jbyteArray JNICALL Java_org_telegram_messenger_voip_NativeInstance_getPersistentState(JNIEnv *env, jobject obj) {
- return copyVectorToJavaByteArray(env, getInstance(env, obj)->getPersistentState().value);
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return nullptr;
+ }
+ return copyVectorToJavaByteArray(env, instance->nativeInstance->getPersistentState().value);
}
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_stopNative(JNIEnv *env, jobject obj) {
InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
instance->nativeInstance->stop([instance](FinalState finalState) {
JNIEnv *env = webrtc::AttachCurrentThreadIfNeeded();
const std::string &path = tgvoip::jni::JavaStringToStdString(env, JavaObject(env, instance->javaInstance).getStringField("persistentStateFilePath"));
@@ -416,6 +584,17 @@ JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_stopNativ
});
}
+JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_stopGroupNative(JNIEnv *env, jobject obj) {
+ InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->groupNativeInstance == nullptr) {
+ return;
+ }
+ instance->groupNativeInstance->stop();
+ instance->groupNativeInstance.reset();
+ env->DeleteGlobalRef(instance->javaInstance);
+ delete instance;
+}
+
JNIEXPORT jlong JNICALL Java_org_telegram_messenger_voip_NativeInstance_createVideoCapturer(JNIEnv *env, jclass clazz, jobject localSink, jboolean front) {
initWebRTC(env);
std::unique_ptr capture = tgcalls::VideoCaptureInterface::Create(front ? "front" : "back", std::make_shared(env));
@@ -441,6 +620,9 @@ JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setVideoS
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_switchCamera(JNIEnv *env, jobject obj, jboolean front) {
InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
if (instance->_videoCapture == nullptr) {
return;
}
@@ -449,6 +631,9 @@ JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_switchCam
JNIEXPORT void Java_org_telegram_messenger_voip_NativeInstance_setVideoState(JNIEnv *env, jobject obj, jint state) {
InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
if (instance->_videoCapture == nullptr) {
return;
}
@@ -457,6 +642,9 @@ JNIEXPORT void Java_org_telegram_messenger_voip_NativeInstance_setVideoState(JNI
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setupOutgoingVideo(JNIEnv *env, jobject obj, jobject localSink, jboolean front) {
InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
if (instance->_videoCapture) {
return;
}
@@ -468,6 +656,9 @@ JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_setupOutg
JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_onSignalingDataReceive(JNIEnv *env, jobject obj, jbyteArray value) {
InstanceHolder *instance = getInstanceHolder(env, obj);
+ if (instance->nativeInstance == nullptr) {
+ return;
+ }
auto *valueBytes = (uint8_t *) env->GetByteArrayElements(value, nullptr);
const size_t size = env->GetArrayLength(value);
@@ -477,5 +668,4 @@ JNIEXPORT void JNICALL Java_org_telegram_messenger_voip_NativeInstance_onSignali
env->ReleaseByteArrayElements(value, (jbyte *) valueBytes, JNI_ABORT);
}
-
}
\ No newline at end of file
diff --git a/TMessagesProj/jni/voip/tgcalls/CryptoHelper.cpp b/TMessagesProj/jni/voip/tgcalls/CryptoHelper.cpp
index 67603734e..a9c8cd3af 100644
--- a/TMessagesProj/jni/voip/tgcalls/CryptoHelper.cpp
+++ b/TMessagesProj/jni/voip/tgcalls/CryptoHelper.cpp
@@ -1,6 +1,7 @@
#include "CryptoHelper.h"
#include
+#include
namespace tgcalls {
diff --git a/TMessagesProj/jni/voip/tgcalls/Instance.h b/TMessagesProj/jni/voip/tgcalls/Instance.h
index 6a48907cd..f91c270a1 100644
--- a/TMessagesProj/jni/voip/tgcalls/Instance.h
+++ b/TMessagesProj/jni/voip/tgcalls/Instance.h
@@ -23,11 +23,13 @@ namespace tgcalls {
class VideoCaptureInterface;
class PlatformContext;
+struct FilePath {
#ifndef _WIN32
-using FilePath = std::string;
+ std::string data;
#else
-using FilePath = std::wstring;
+ std::wstring data;
#endif
+};
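Since FilePath is now a struct, call sites read the wrapped string through .data, as the Manager.cpp and LogSinkImpl.cpp hunks later in this patch show. A minimal sketch, assuming the definition above; the include path and the literal paths are illustrative.

    #include <string>
    #include "Instance.h"  // assumed include path for the FilePath struct above

    tgcalls::FilePath MakeLogPath() {
      tgcalls::FilePath logPath;
    #ifndef _WIN32
      logPath.data = "/data/data/org.telegram.messenger/files/voip.log";  // illustrative
    #else
      logPath.data = L"C:\\voip.log";                                     // illustrative
    #endif
      return logPath;  // consumers such as LogSinkImpl open logPath.data
    }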
struct Proxy {
std::string host;
@@ -217,6 +219,7 @@ struct Descriptor {
std::shared_ptr videoCapture;
std::function stateUpdated;
std::function signalBarsUpdated;
+ std::function<void(float)> audioLevelUpdated;
std::function remoteBatteryLevelIsLowUpdated;
std::function remoteMediaStateUpdated;
std::function remotePrefferedAspectRatioUpdated;
diff --git a/TMessagesProj/jni/voip/tgcalls/InstanceImpl.cpp b/TMessagesProj/jni/voip/tgcalls/InstanceImpl.cpp
index 0abdecdd6..508954928 100644
--- a/TMessagesProj/jni/voip/tgcalls/InstanceImpl.cpp
+++ b/TMessagesProj/jni/voip/tgcalls/InstanceImpl.cpp
@@ -25,11 +25,11 @@ rtc::Thread *getManagerThread() {
} // namespace
InstanceImpl::InstanceImpl(Descriptor &&descriptor)
-: _logSink(std::make_unique<LogSinkImpl>(descriptor.config)) {
+: _logSink(std::make_unique<LogSinkImpl>(descriptor.config.logPath)) {
rtc::LogMessage::LogToDebug(rtc::LS_INFO);
rtc::LogMessage::SetLogToStderr(false);
rtc::LogMessage::AddLogToStream(_logSink.get(), rtc::LS_INFO);
-
+
auto networkType = descriptor.initialNetworkType;
_manager.reset(new ThreadLocalObject(getManagerThread(), [descriptor = std::move(descriptor)]() mutable {
@@ -38,7 +38,7 @@ InstanceImpl::InstanceImpl(Descriptor &&descriptor)
_manager->perform(RTC_FROM_HERE, [](Manager *manager) {
manager->start();
});
-
+
setNetworkType(networkType);
}
@@ -74,7 +74,7 @@ void InstanceImpl::setNetworkType(NetworkType networkType) {
default:
break;
}
-
+
_manager->perform(RTC_FROM_HERE, [isLowCostNetwork](Manager *manager) {
manager->setIsLocalNetworkLowCost(isLowCostNetwork);
});
@@ -154,7 +154,7 @@ PersistentState InstanceImpl::getPersistentState() {
void InstanceImpl::stop(std::function completion) {
std::string debugLog = _logSink->result();
-
+
_manager->perform(RTC_FROM_HERE, [completion, debugLog = std::move(debugLog)](Manager *manager) {
manager->getNetworkStats([completion, debugLog = std::move(debugLog)](TrafficStats stats, CallStats callStats) {
FinalState finalState;
@@ -162,7 +162,7 @@ void InstanceImpl::stop(std::function completion) {
finalState.isRatingSuggested = false;
finalState.trafficStats = stats;
finalState.callStats = callStats;
-
+
completion(finalState);
});
});
diff --git a/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.cpp b/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.cpp
index 77755c319..b7e53e07f 100644
--- a/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.cpp
+++ b/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.cpp
@@ -11,9 +11,9 @@
namespace tgcalls {
-LogSinkImpl::LogSinkImpl(const Config &config) {
- if (!config.logPath.empty()) {
- _file.open(config.logPath);
+LogSinkImpl::LogSinkImpl(const FilePath &logPath) {
+ if (!logPath.data.empty()) {
+ _file.open(logPath.data);
}
}
@@ -64,7 +64,7 @@ void LogSinkImpl::OnLogMessage(const std::string &message) {
<< ":" << timeinfo.tm_sec
<< ":" << milliseconds
<< " " << message;
-
+
#if DEBUG
printf("%d-%d-%d %d:%d:%d:%d %s\n", timeinfo.tm_year + 1900, timeinfo.tm_mon + 1, timeinfo.tm_mday, timeinfo.tm_hour, timeinfo.tm_min, timeinfo.tm_sec, milliseconds, message.c_str());
#endif
diff --git a/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.h b/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.h
index 5c6aa1d11..5d480ffb0 100644
--- a/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.h
+++ b/TMessagesProj/jni/voip/tgcalls/LogSinkImpl.h
@@ -6,11 +6,11 @@
namespace tgcalls {
-struct Config;
+struct FilePath;
class LogSinkImpl final : public rtc::LogSink {
public:
- LogSinkImpl(const Config &config);
+ LogSinkImpl(const FilePath &logPath);
void OnLogMessage(const std::string &msg, rtc::LoggingSeverity severity, const char *tag) override;
void OnLogMessage(const std::string &message, rtc::LoggingSeverity severity) override;
diff --git a/TMessagesProj/jni/voip/tgcalls/Manager.cpp b/TMessagesProj/jni/voip/tgcalls/Manager.cpp
index affc0c346..a789d088c 100644
--- a/TMessagesProj/jni/voip/tgcalls/Manager.cpp
+++ b/TMessagesProj/jni/voip/tgcalls/Manager.cpp
@@ -27,11 +27,11 @@ rtc::Thread *makeMediaThread() {
}
void dumpStatsLog(const FilePath &path, const CallStats &stats) {
- if (path.empty()) {
+ if (path.data.empty()) {
return;
}
std::ofstream file;
- file.open(path);
+ file.open(path.data);
file << "{";
file << "\"v\":\"" << 1 << "\"";
@@ -112,6 +112,7 @@ _enableStunMarking(descriptor.config.enableStunMarking),
_protocolVersion(descriptor.config.protocolVersion),
_statsLogPath(descriptor.config.statsLogPath),
_rtcServers(std::move(descriptor.rtcServers)),
+_proxy(std::move(descriptor.proxy)),
_mediaDevicesConfig(std::move(descriptor.mediaDevicesConfig)),
_videoCapture(std::move(descriptor.videoCapture)),
_stateUpdated(std::move(descriptor.stateUpdated)),
@@ -120,6 +121,7 @@ _remoteBatteryLevelIsLowUpdated(std::move(descriptor.remoteBatteryLevelIsLowUpda
_remotePrefferedAspectRatioUpdated(std::move(descriptor.remotePrefferedAspectRatioUpdated)),
_signalingDataEmitted(std::move(descriptor.signalingDataEmitted)),
_signalBarsUpdated(std::move(descriptor.signalBarsUpdated)),
+_audioLevelUpdated(std::move(descriptor.audioLevelUpdated)),
_enableHighBitrateVideo(descriptor.config.enableHighBitrateVideo),
_dataSaving(descriptor.config.dataSaving),
_platformContext(descriptor.platformContext) {
@@ -177,7 +179,7 @@ void Manager::start() {
strong->_sendSignalingMessage(std::move(message));
});
};
- _networkManager.reset(new ThreadLocalObject(getNetworkThread(), [weak, thread, sendSignalingMessage, encryptionKey = _encryptionKey, enableP2P = _enableP2P, enableTCP = _enableTCP, enableStunMarking = _enableStunMarking, rtcServers = _rtcServers] {
+ _networkManager.reset(new ThreadLocalObject(getNetworkThread(), [weak, thread, sendSignalingMessage, encryptionKey = _encryptionKey, enableP2P = _enableP2P, enableTCP = _enableTCP, enableStunMarking = _enableStunMarking, rtcServers = _rtcServers, proxy = std::move(_proxy)] () mutable {
return new NetworkManager(
getNetworkThread(),
encryptionKey,
@@ -185,6 +187,7 @@ void Manager::start() {
enableTCP,
enableStunMarking,
rtcServers,
+ std::move(proxy),
[=](const NetworkManager::State &state) {
thread->PostTask(RTC_FROM_HERE, [=] {
const auto strong = weak.lock();
@@ -242,7 +245,7 @@ void Manager::start() {
});
}));
bool isOutgoing = _encryptionKey.isOutgoing;
- _mediaManager.reset(new ThreadLocalObject(getMediaThread(), [weak, isOutgoing, protocolVersion = _protocolVersion, thread, sendSignalingMessage, videoCapture = _videoCapture, mediaDevicesConfig = _mediaDevicesConfig, enableHighBitrateVideo = _enableHighBitrateVideo, signalBarsUpdated = _signalBarsUpdated, preferredCodecs = _preferredCodecs, platformContext = _platformContext]() {
+ _mediaManager.reset(new ThreadLocalObject(getMediaThread(), [weak, isOutgoing, protocolVersion = _protocolVersion, thread, sendSignalingMessage, videoCapture = _videoCapture, mediaDevicesConfig = _mediaDevicesConfig, enableHighBitrateVideo = _enableHighBitrateVideo, signalBarsUpdated = _signalBarsUpdated, audioLevelUpdated = _audioLevelUpdated, preferredCodecs = _preferredCodecs, platformContext = _platformContext]() {
return new MediaManager(
getMediaThread(),
isOutgoing,
@@ -260,6 +263,7 @@ void Manager::start() {
});
},
signalBarsUpdated,
+ audioLevelUpdated,
enableHighBitrateVideo,
preferredCodecs,
platformContext);
diff --git a/TMessagesProj/jni/voip/tgcalls/Manager.h b/TMessagesProj/jni/voip/tgcalls/Manager.h
index bc720fbb9..621936c1a 100644
--- a/TMessagesProj/jni/voip/tgcalls/Manager.h
+++ b/TMessagesProj/jni/voip/tgcalls/Manager.h
@@ -57,6 +57,7 @@ private:
ProtocolVersion _protocolVersion = ProtocolVersion::V0;
FilePath _statsLogPath;
std::vector<RtcServer> _rtcServers;
+ std::unique_ptr<Proxy> _proxy;
MediaDevicesConfig _mediaDevicesConfig;
std::shared_ptr<VideoCaptureInterface> _videoCapture;
std::function<void(State)> _stateUpdated;
@@ -65,6 +66,7 @@ private:
std::function<void(float)> _remotePrefferedAspectRatioUpdated;
std::function<void(const std::vector<uint8_t> &)> _signalingDataEmitted;
std::function<void(int)> _signalBarsUpdated;
+ std::function<void(float)> _audioLevelUpdated;
std::function _sendSignalingMessage;
std::function _sendTransportMessage;
std::unique_ptr<ThreadLocalObject<NetworkManager>> _networkManager;
diff --git a/TMessagesProj/jni/voip/tgcalls/MediaManager.cpp b/TMessagesProj/jni/voip/tgcalls/MediaManager.cpp
index 1a7eed12b..64377fcbb 100644
--- a/TMessagesProj/jni/voip/tgcalls/MediaManager.cpp
+++ b/TMessagesProj/jni/voip/tgcalls/MediaManager.cpp
@@ -17,6 +17,8 @@
#include "api/video/builtin_video_bitrate_allocator_factory.h"
#include "call/call.h"
#include "modules/rtp_rtcp/source/rtp_utility.h"
+#include "api/call/audio_sink.h"
+#include "modules/audio_processing/audio_buffer.h"
namespace tgcalls {
namespace {
@@ -82,6 +84,47 @@ private:
};
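+// Audio sink that derives a coarse signal level from raw PCM: it tracks the
+// absolute sample peak and periodically reports it through the supplied callback.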
+class AudioTrackSinkInterfaceImpl: public webrtc::AudioSinkInterface {
+private:
+ std::function<void(float)> _update;
+
+ int _peakCount = 0;
+ uint16_t _peak = 0;
+
+public:
+ AudioTrackSinkInterfaceImpl(std::function<void(float)> update) :
+ _update(update) {
+ }
+
+ virtual ~AudioTrackSinkInterfaceImpl() {
+ }
+
+ virtual void OnData(const Data& audio) override {
+ if (audio.channels == 1) {
+ int16_t *samples = (int16_t *)audio.data;
+ int numberOfSamplesInFrame = (int)audio.samples_per_channel;
+
+ for (int i = 0; i < numberOfSamplesInFrame; i++) {
+ int16_t sample = samples[i];
+ if (sample < 0) {
+ sample = -sample;
+ }
+ if (_peak < sample) {
+ _peak = sample;
+ }
+ _peakCount += 1;
+ }
+
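+ // Once enough samples have accumulated (1200, roughly 25 ms at 48 kHz),
+ // scale the peak by a tuned constant, report it, and reset the accumulator.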
+ if (_peakCount >= 1200) {
+ float level = ((float)(_peak)) / 4000.0f;
+ _peak = 0;
+ _peakCount = 0;
+ _update(level);
+ }
+ }
+ }
+};
+
rtc::Thread *MediaManager::getWorkerThread() {
static rtc::Thread *value = makeWorkerThread();
return value;
@@ -96,6 +139,7 @@ MediaManager::MediaManager(
std::function<void(Message &&)> sendSignalingMessage,
std::function<void(Message &&)> sendTransportMessage,
std::function<void(int)> signalBarsUpdated,
+ std::function<void(float)> audioLevelUpdated,
bool enableHighBitrateVideo,
std::vector<VideoCodecName> preferredCodecs,
std::shared_ptr<PlatformContext> platformContext) :
@@ -105,6 +149,7 @@ _taskQueueFactory(webrtc::CreateDefaultTaskQueueFactory()),
_sendSignalingMessage(std::move(sendSignalingMessage)),
_sendTransportMessage(std::move(sendTransportMessage)),
_signalBarsUpdated(std::move(signalBarsUpdated)),
+_audioLevelUpdated(std::move(audioLevelUpdated)),
_protocolVersion(protocolVersion),
_outgoingVideoState(videoCapture ? VideoState::Active : VideoState::Inactive),
_videoCapture(std::move(videoCapture)),
@@ -264,6 +309,27 @@ rtc::scoped_refptr MediaManager::createAudioDeviceMod
}
void MediaManager::start() {
+ const auto weak = std::weak_ptr<MediaManager>(shared_from_this());
+
+ // Here we hope that thread outlives the sink
+ rtc::Thread *thread = _thread;
+ std::unique_ptr<AudioTrackSinkInterfaceImpl> incomingSink(new AudioTrackSinkInterfaceImpl([weak, thread](float level) {
+ thread->PostTask(RTC_FROM_HERE, [weak, level] {
+ if (const auto strong = weak.lock()) {
+ strong->_currentAudioLevel = level;
+ }
+ });
+ }));
+ std::unique_ptr<AudioTrackSinkInterfaceImpl> outgoingSink(new AudioTrackSinkInterfaceImpl([weak, thread](float level) {
+ thread->PostTask(RTC_FROM_HERE, [weak, level] {
+ if (const auto strong = weak.lock()) {
+ strong->_currentMyAudioLevel = level;
+ }
+ });
+ }));
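+ // Attach the sinks so that raw audio in both directions feeds the level estimators.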
+ _audioChannel->SetRawAudioSink(_ssrcAudio.incoming, std::move(incomingSink));
+ _audioChannel->SetRawAudioSink(_ssrcAudio.outgoing, std::move(outgoingSink));
+
_sendSignalingMessage({ _myVideoFormats });
if (_videoCapture != nullptr) {
@@ -271,6 +337,9 @@ void MediaManager::start() {
}
beginStatsTimer(3000);
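+ // Only poll levels when a consumer actually registered an audioLevelUpdated callback.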
+ if (_audioLevelUpdated != nullptr) {
+ beginLevelsTimer(50);
+ }
}
MediaManager::~MediaManager() {
@@ -369,6 +438,21 @@ void MediaManager::beginStatsTimer(int timeoutMs) {
}, timeoutMs);
}
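+// Repeating 50 ms timer that reports the louder of the remote and local levels.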
+void MediaManager::beginLevelsTimer(int timeoutMs) {
+ const auto weak = std::weak_ptr<MediaManager>(shared_from_this());
+ _thread->PostDelayedTask(RTC_FROM_HERE, [weak]() {
+ auto strong = weak.lock();
+ if (!strong) {
+ return;
+ }
+
+ float effectiveLevel = fmaxf(strong->_currentAudioLevel, strong->_currentMyAudioLevel);
+ strong->_audioLevelUpdated(effectiveLevel);
+
+ strong->beginLevelsTimer(50);
+ }, timeoutMs);
+}
+
void MediaManager::collectStats() {
auto stats = _call->GetStats();
float bitrateNorm = 16.0f;
diff --git a/TMessagesProj/jni/voip/tgcalls/MediaManager.h b/TMessagesProj/jni/voip/tgcalls/MediaManager.h
index 1bd372a33..154a7f2cf 100644
--- a/TMessagesProj/jni/voip/tgcalls/MediaManager.h
+++ b/TMessagesProj/jni/voip/tgcalls/MediaManager.h
@@ -47,6 +47,7 @@ public:
std::function<void(Message &&)> sendSignalingMessage,
std::function<void(Message &&)> sendTransportMessage,
std::function<void(int)> signalBarsUpdated,
+ std::function<void(float)> audioLevelUpdated,
bool enableHighBitrateVideo,
std::vector<VideoCodecName> preferredCodecs,
std::shared_ptr<PlatformContext> platformContext);
@@ -115,6 +116,7 @@ private:
rtc::scoped_refptr<webrtc::AudioDeviceModule> createAudioDeviceModule();
void beginStatsTimer(int timeoutMs);
+ void beginLevelsTimer(int timeoutMs);
void collectStats();
rtc::Thread *_thread = nullptr;
@@ -124,6 +126,7 @@ private:
std::function<void(Message &&)> _sendSignalingMessage;
std::function<void(Message &&)> _sendTransportMessage;
std::function<void(int)> _signalBarsUpdated;
+ std::function<void(float)> _audioLevelUpdated;
SSRC _ssrcAudio;
SSRC _ssrcVideo;
@@ -158,6 +161,9 @@ private:
bool _enableHighBitrateVideo = false;
bool _isLowCostNetwork = false;
bool _isDataSavingActive = false;
+
+ float _currentAudioLevel = 0.0f;
+ float _currentMyAudioLevel = 0.0f;
std::unique_ptr _audioNetworkInterface;
std::unique_ptr _videoNetworkInterface;
diff --git a/TMessagesProj/jni/voip/tgcalls/NetworkManager.cpp b/TMessagesProj/jni/voip/tgcalls/NetworkManager.cpp
index 4df4b14c7..e1fd6624f 100644
--- a/TMessagesProj/jni/voip/tgcalls/NetworkManager.cpp
+++ b/TMessagesProj/jni/voip/tgcalls/NetworkManager.cpp
@@ -23,6 +23,41 @@ extern "C" {
namespace tgcalls {
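+// Minimal rtc::CryptStringImpl wrapper that stores the proxy password so it
+// can be handed to rtc::ProxyInfo as a rtc::CryptString.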
+class TgCallsCryptStringImpl : public rtc::CryptStringImpl {
+public:
+ TgCallsCryptStringImpl(std::string const &value) :
+ _value(value) {
+ }
+
+ virtual ~TgCallsCryptStringImpl() override {
+ }
+
+ virtual size_t GetLength() const override {
+ return _value.size();
+ }
+
+ virtual void CopyTo(char* dest, bool nullterminate) const override {
+ memcpy(dest, _value.data(), _value.size());
+ if (nullterminate) {
+ dest[_value.size()] = 0;
+ }
+ }
+ virtual std::string UrlEncode() const override {
+ return _value;
+ }
+ virtual CryptStringImpl* Copy() const override {
+ return new TgCallsCryptStringImpl(_value);
+ }
+
+ virtual void CopyRawTo(std::vector<unsigned char>* dest) const override {
+ dest->resize(_value.size());
+ memcpy(dest->data(), _value.data(), _value.size());
+ }
+
+private:
+ std::string _value;
+};
+
class TurnCustomizerImpl : public webrtc::TurnCustomizer {
public:
TurnCustomizerImpl() {
@@ -48,6 +83,7 @@ NetworkManager::NetworkManager(
bool enableTCP,
bool enableStunMarking,
std::vector<RtcServer> const &rtcServers,
+ std::unique_ptr<Proxy> proxy,
std::function<void(const NetworkManager::State &)> stateUpdated,
std::function transportMessageReceived,
std::function sendSignalingMessage,
@@ -57,6 +93,7 @@ _enableP2P(enableP2P),
_enableTCP(enableTCP),
_enableStunMarking(enableStunMarking),
_rtcServers(rtcServers),
+_proxy(std::move(proxy)),
_transport(
EncryptedConnection::Type::Transport,
encryptionKey,
@@ -100,6 +137,16 @@ void NetworkManager::start() {
flags |= cricket::PORTALLOCATOR_DISABLE_UDP;
flags |= cricket::PORTALLOCATOR_DISABLE_STUN;
}
+
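+ // When a proxy is configured, route the port allocator's connections through it as SOCKS5.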
+ if (_proxy) {
+ rtc::ProxyInfo proxyInfo;
+ proxyInfo.type = rtc::ProxyType::PROXY_SOCKS5;
+ proxyInfo.address = rtc::SocketAddress(_proxy->host, _proxy->port);
+ proxyInfo.username = _proxy->login;
+ proxyInfo.password = rtc::CryptString(TgCallsCryptStringImpl(_proxy->password));
+ _portAllocator->set_proxy("t/1.0", proxyInfo);
+ }
+
_portAllocator->set_flags(_portAllocator->flags() | flags);
_portAllocator->Initialize();
diff --git a/TMessagesProj/jni/voip/tgcalls/NetworkManager.h b/TMessagesProj/jni/voip/tgcalls/NetworkManager.h
index 7f657f0bf..53c53d28a 100644
--- a/TMessagesProj/jni/voip/tgcalls/NetworkManager.h
+++ b/TMessagesProj/jni/voip/tgcalls/NetworkManager.h
@@ -55,6 +55,7 @@ public:
bool enableTCP,
bool enableStunMarking,
std::vector<RtcServer> const &rtcServers,
+ std::unique_ptr<Proxy> proxy,
std::function<void(const NetworkManager::State &)> stateUpdated,
std::function transportMessageReceived,
std::function sendSignalingMessage,
@@ -85,6 +86,7 @@ private:
bool _enableTCP = false;
bool _enableStunMarking = false;
std::vector<RtcServer> _rtcServers;
+ std::unique_ptr<Proxy> _proxy;
EncryptedConnection _transport;
bool _isOutgoing = false;
std::function _stateUpdated;
diff --git a/TMessagesProj/jni/voip/tgcalls/group/GroupInstanceImpl.cpp b/TMessagesProj/jni/voip/tgcalls/group/GroupInstanceImpl.cpp
new file mode 100644
index 000000000..fcd5be934
--- /dev/null
+++ b/TMessagesProj/jni/voip/tgcalls/group/GroupInstanceImpl.cpp
@@ -0,0 +1,2125 @@
+#include "GroupInstanceImpl.h"
+
+#include <memory> // inferred from std::unique_ptr/std::shared_ptr usage below
+#include "api/scoped_refptr.h"
+#include "rtc_base/thread.h"
+#include "rtc_base/logging.h"
+#include "api/peer_connection_interface.h"
+#include "api/task_queue/default_task_queue_factory.h"
+#include "media/engine/webrtc_media_engine.h"
+#include "api/audio_codecs/audio_decoder_factory_template.h"
+#include "api/audio_codecs/audio_encoder_factory_template.h"
+#include "api/audio_codecs/opus/audio_decoder_opus.h"
+#include "api/audio_codecs/opus/audio_encoder_opus.h"
+#include "api/audio_codecs/builtin_audio_encoder_factory.h"
+#include "api/audio_codecs/builtin_audio_decoder_factory.h"
+#include "api/rtc_event_log/rtc_event_log_factory.h"
+#include "api/peer_connection_interface.h"
+#include "api/video_track_source_proxy.h"
+#include "system_wrappers/include/field_trial.h"
+#include "api/stats/rtcstats_objects.h"
+#include "modules/audio_processing/audio_buffer.h"
+#include "common_audio/include/audio_util.h"
+#include "common_audio/vad/include/webrtc_vad.h"
+#include "modules/audio_processing/agc2/vad_with_level.h"
+
+#include "ThreadLocalObject.h"
+#include "Manager.h"
+#include "NetworkManager.h"
+#include "VideoCaptureInterfaceImpl.h"
+#include "platform/PlatformInterface.h"
+#include "LogSinkImpl.h"
+
+#include <sstream>   // inferred from std::istringstream/std::ostringstream usage below
+#include <set>       // inferred from std::set usage below
+#include <algorithm> // inferred from std::find usage below
+
+namespace tgcalls {
+
+namespace {
+
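+// SDP helpers: split a blob into trimmed lines and collect attribute values by prefix.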
+static std::vector<std::string> splitSdpLines(std::string const &sdp) {
+ std::vector<std::string> result;
+
+ std::istringstream sdpStream(sdp);
+
+ std::string s;
+ while (std::getline(sdpStream, s, '\n')) {
+ if (s.size() == 0) {
+ continue;
+ }
+ if (s[s.size() - 1] == '\r') {
+ s.resize(s.size() - 1);
+ }
+ result.push_back(s);
+ }
+
+ return result;
+}
+
+static std::vector<std::string> splitFingerprintLines(std::string const &line) {
+ std::vector<std::string> result;
+
+ std::istringstream sdpStream(line);
+
+ std::string s;
+ while (std::getline(sdpStream, s, ' ')) {
+ if (s.size() == 0) {
+ continue;
+ }
+ result.push_back(s);
+ }
+
+ return result;
+}
+
+static std::vector<std::string> getLines(std::vector<std::string> const &lines, std::string prefix) {
+ std::vector<std::string> result;
+
+ for (auto &line : lines) {
+ if (line.find(prefix) == 0) {
+ auto cleanLine = line;
+ cleanLine.replace(0, prefix.size(), "");
+ result.push_back(cleanLine);
+ }
+ }
+
+ return result;
+}
+
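+// Builds the join payload from the locally generated offer: extracts the ICE
+// ufrag/pwd and DTLS fingerprints; the audio ssrc parsing is currently disabled
+// and the ssrc is left at 0.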
+static absl::optional<GroupJoinPayload> parseSdpIntoJoinPayload(std::string const &sdp) {
+ GroupJoinPayload result;
+
+ auto lines = splitSdpLines(sdp);
+
+ std::vector<std::string> audioLines;
+ bool isAudioLine = false;
+ for (auto &line : lines) {
+ if (line.find("m=audio") == 0) {
+ isAudioLine = true;
+ }
+ if (isAudioLine) {
+ audioLines.push_back(line);
+ }
+ }
+
+ /*std::vector<uint32_t> audioSources;
+ for (auto &line : getLines(audioLines, "a=ssrc:")) {
+ std::istringstream iss(line);
+ uint32_t value = 0;
+ iss >> value;
+ if (std::find(audioSources.begin(), audioSources.end(), value) == audioSources.end()) {
+ audioSources.push_back(value);
+ }
+ }
+
+ if (audioSources.size() != 1) {
+ return absl::nullopt;
+ }
+ result.ssrc = audioSources[0];*/
+ result.ssrc = 0;
+
+ auto ufragLines = getLines(lines, "a=ice-ufrag:");
+ if (ufragLines.size() != 1) {
+ return absl::nullopt;
+ }
+ result.ufrag = ufragLines[0];
+
+ auto pwdLines = getLines(lines, "a=ice-pwd:");
+ if (pwdLines.size() != 1) {
+ return absl::nullopt;
+ }
+ result.pwd = pwdLines[0];
+
+ for (auto &line : getLines(lines, "a=fingerprint:")) {
+ auto fingerprintComponents = splitFingerprintLines(line);
+ if (fingerprintComponents.size() != 2) {
+ continue;
+ }
+
+ GroupJoinPayloadFingerprint fingerprint;
+ fingerprint.hash = fingerprintComponents[0];
+ fingerprint.fingerprint = fingerprintComponents[1];
+ fingerprint.setup = "active";
+ result.fingerprints.push_back(fingerprint);
+ }
+
+ return result;
+}
+
+struct StreamSpec {
+ bool isMain = false;
+ uint32_t streamId = 0;
+ uint32_t audioSsrcOrZero = 0;
+ bool isRemoved = false;
+};
+
+static void appendSdp(std::vector<std::string> &lines, std::string const &line) {
+ lines.push_back(line);
+}
+
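+// Manually assembles the remote SDP: a main m=audio section carrying the ICE and
+// DTLS parameters plus one bundled section per additional stream in bundleStreams.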
+static std::string createSdp(uint32_t sessionId, GroupJoinResponsePayload const &payload, bool isAnswer, std::vector<StreamSpec> const &bundleStreams) {
+ std::vector<std::string> sdp;
+
+ appendSdp(sdp, "v=0");
+
+ std::ostringstream sessionIdString;
+ sessionIdString << "o=- ";
+ sessionIdString << sessionId;
+ sessionIdString << " 2 IN IP4 0.0.0.0";
+ appendSdp(sdp, sessionIdString.str());
+
+ appendSdp(sdp, "s=-");
+ appendSdp(sdp, "t=0 0");
+
+ std::ostringstream bundleString;
+ bundleString << "a=group:BUNDLE";
+ for (auto &stream : bundleStreams) {
+ bundleString << " ";
+ if (stream.isMain) {
+ bundleString << "0";
+ } else {
+ bundleString << "audio";
+ bundleString << stream.streamId;
+ }
+ }
+ appendSdp(sdp, bundleString.str());
+
+ appendSdp(sdp, "a=ice-lite");
+
+ for (auto &stream : bundleStreams) {
+ std::ostringstream audioMidString;
+ if (stream.isMain) {
+ audioMidString << "0";
+ } else {
+ audioMidString << "audio";
+ audioMidString << stream.streamId;
+ }
+
+ std::ostringstream mLineString;
+ mLineString << "m=audio ";
+ if (stream.isMain) {
+ mLineString << "1";
+ } else {
+ mLineString << "0";
+ }
+ mLineString << " RTP/SAVPF 111 126";
+
+ appendSdp(sdp, mLineString.str());
+
+ if (stream.isMain) {
+ appendSdp(sdp, "c=IN IP4 0.0.0.0");
+ }
+
+ std::ostringstream mLineMidString;
+ mLineMidString << "a=mid:";
+ mLineMidString << audioMidString.str();
+ appendSdp(sdp, mLineMidString.str());
+
+ if (stream.isMain) {
+ std::ostringstream ufragString;
+ ufragString << "a=ice-ufrag:";
+ ufragString << payload.ufrag;
+ appendSdp(sdp, ufragString.str());
+
+ std::ostringstream pwdString;
+ pwdString << "a=ice-pwd:";
+ pwdString << payload.pwd;
+ appendSdp(sdp, pwdString.str());
+
+ for (auto &fingerprint : payload.fingerprints) {
+ std::ostringstream fingerprintString;
+ fingerprintString << "a=fingerprint:";
+ fingerprintString << fingerprint.hash;
+ fingerprintString << " ";
+ fingerprintString << fingerprint.fingerprint;
+ appendSdp(sdp, fingerprintString.str());
+ appendSdp(sdp, "a=setup:passive");
+ }
+
+ for (auto &candidate : payload.candidates) {
+ std::ostringstream candidateString;
+ candidateString << "a=candidate:";
+ candidateString << candidate.foundation;
+ candidateString << " ";
+ candidateString << candidate.component;
+ candidateString << " ";
+ candidateString << candidate.protocol;
+ candidateString << " ";
+ candidateString << candidate.priority;
+ candidateString << " ";
+ candidateString << candidate.ip;
+ candidateString << " ";
+ candidateString << candidate.port;
+ candidateString << " ";
+ candidateString << "typ ";
+ candidateString << candidate.type;
+ candidateString << " ";
+
+ if (candidate.type == "srflx" || candidate.type == "prflx" || candidate.type == "relay") {
+ if (candidate.relAddr.size() != 0 && candidate.relPort.size() != 0) {
+ candidateString << "raddr ";
+ candidateString << candidate.relAddr;
+ candidateString << " ";
+ candidateString << "rport ";
+ candidateString << candidate.relPort;
+ candidateString << " ";
+ }
+ }
+
+ if (candidate.protocol == "tcp") {
+ if (candidate.tcpType.size() != 0) {
+ candidateString << "tcptype ";
+ candidateString << candidate.tcpType;
+ candidateString << " ";
+ }
+ }
+
+ candidateString << "generation ";
+ candidateString << candidate.generation;
+
+ appendSdp(sdp, candidateString.str());
+ }
+ }
+
+ appendSdp(sdp, "a=rtpmap:111 opus/48000/2");
+ appendSdp(sdp, "a=rtpmap:126 telephone-event/8000");
+ appendSdp(sdp, "a=fmtp:111 minptime=10; useinbandfec=1");
+ appendSdp(sdp, "a=rtcp:1 IN IP4 0.0.0.0");
+ appendSdp(sdp, "a=rtcp-mux");
+ appendSdp(sdp, "a=extmap:1 urn:ietf:params:rtp-hdrext:ssrc-audio-level");
+ appendSdp(sdp, "a=extmap:3 http://www.webrtc.org/experiments/rtp-hdrext/abs-send-time");
+ appendSdp(sdp, "a=extmap:5 http://www.ietf.org/id/draft-holmer-rmcat-transport-wide-cc-extensions-01");
+ appendSdp(sdp, "a=rtcp-fb:111 transport-cc");
+
+ if (isAnswer && stream.isMain) {
+ appendSdp(sdp, "a=recvonly");
+ } else {
+ if (stream.isMain) {
+ appendSdp(sdp, "a=sendrecv");
+ } else {
+ appendSdp(sdp, "a=sendonly");
+ appendSdp(sdp, "a=bundle-only");
+ }
+
+ /*std::ostringstream ssrcGroupString;
+ ssrcGroupString << "a=ssrc-group:FID ";
+ ssrcGroupString << stream.audioSsrc;
+ appendSdp(sdp, ssrcGroupString.str());*/
+
+ if (stream.isRemoved) {
+ appendSdp(sdp, "a=inactive");
+ } else {
+ std::ostringstream cnameString;
+ cnameString << "a=ssrc:";
+ cnameString << stream.audioSsrcOrZero;
+ cnameString << " cname:stream";
+ cnameString << stream.streamId;
+ appendSdp(sdp, cnameString.str());
+
+ std::ostringstream msidString;
+ msidString << "a=ssrc:";
+ msidString << stream.audioSsrcOrZero;
+ msidString << " msid:stream";
+ msidString << stream.streamId;
+ msidString << " audio" << stream.streamId;
+ appendSdp(sdp, msidString.str());
+
+ std::ostringstream mslabelString;
+ mslabelString << "a=ssrc:";
+ mslabelString << stream.audioSsrcOrZero;
+ mslabelString << " mslabel:audio";
+ mslabelString << stream.streamId;
+ appendSdp(sdp, mslabelString.str());
+
+ std::ostringstream labelString;
+ labelString << "a=ssrc:";
+ labelString << stream.audioSsrcOrZero;
+ labelString << " label:audio";
+ labelString << stream.streamId;
+ appendSdp(sdp, labelString.str());
+ }
+ }
+ }
+
+ std::ostringstream result;
+ for (auto &line : sdp) {
+ result << line << "\n";
+ }
+
+ return result.str();
+}
+
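+// Maps the join response onto an SDP string: the main stream carries the local
+// audio ssrc, and every other known ssrc gets its own bundled stream entry,
+// marked inactive when it is not currently active.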
+static std::string parseJoinResponseIntoSdp(uint32_t sessionId, uint32_t mainStreamAudioSsrc, GroupJoinResponsePayload const &payload, bool isAnswer, std::vector<uint32_t> const &allOtherSsrcs, std::set<uint32_t> const &activeOtherSsrcs) {
+
+ std::vector<StreamSpec> bundleStreams;
+
+ StreamSpec mainStream;
+ mainStream.isMain = true;
+ mainStream.streamId = 0;
+ mainStream.audioSsrcOrZero = mainStreamAudioSsrc;
+ mainStream.isRemoved = false;
+ bundleStreams.push_back(mainStream);
+
+ uint32_t numStreamsToAllocate = (uint32_t)allOtherSsrcs.size();
+ /*if (numStreamsToAllocate < 10) {
+ numStreamsToAllocate = 10;
+ }*/
+
+ for (uint32_t i = 0; i < numStreamsToAllocate; i++) {
+ StreamSpec stream;
+ stream.isMain = false;
+ if (i < allOtherSsrcs.size()) {
+ uint32_t ssrc = allOtherSsrcs[i];
+ stream.audioSsrcOrZero = ssrc;
+ stream.isRemoved = activeOtherSsrcs.find(ssrc) == activeOtherSsrcs.end();
+ stream.streamId = ssrc;
+ } else {
+ stream.audioSsrcOrZero = 0;
+ stream.isRemoved = true;
+ stream.streamId = 1 + (uint32_t)i;
+ }
+ bundleStreams.push_back(stream);
+ }
+
+ return createSdp(sessionId, payload, isAnswer, bundleStreams);
+}
+
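+// Lazily created, process-wide threads for the group call's network and worker queues.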
+rtc::Thread *makeNetworkThread() {
+ static std::unique_ptr<rtc::Thread> value = rtc::Thread::CreateWithSocketServer();
+ value->SetName("WebRTC-Group-Network", nullptr);
+ value->Start();
+ return value.get();
+}
+
+rtc::Thread *getNetworkThread() {
+ static rtc::Thread *value = makeNetworkThread();
+ return value;
+}
+
+rtc::Thread *makeWorkerThread() {
+ static std::unique_ptr<rtc::Thread> value = rtc::Thread::Create();
+ value->SetName("WebRTC-Group-Worker", nullptr);
+ value->Start();
+ return value.get();
+}
+
+rtc::Thread *getWorkerThread() {
+ static rtc::Thread *value = makeWorkerThread();
+ return value;
+}
+
+rtc::Thread *getSignalingThread() {
+ return Manager::getMediaThread();
+}
+
+rtc::Thread *getMediaThread() {
+ return Manager::getMediaThread();
+}
+
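+// The frame "encryption" below is a byte-wise XOR with a constant; it only
+// obfuscates payloads and reads as a placeholder rather than real encryption.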
+class FrameEncryptorImpl : public webrtc::FrameEncryptorInterface {
+public:
+ FrameEncryptorImpl() {
+ }
+
+ virtual int Encrypt(cricket::MediaType media_type,
+ uint32_t ssrc,
+ rtc::ArrayView<const uint8_t> additional_data,
+ rtc::ArrayView<const uint8_t> frame,
+ rtc::ArrayView<uint8_t> encrypted_frame,
+ size_t* bytes_written) override {
+ memcpy(encrypted_frame.data(), frame.data(), frame.size());
+ for (auto it = encrypted_frame.begin(); it != encrypted_frame.end(); it++) {
+ *it ^= 123;
+ }
+ *bytes_written = frame.size();
+ return 0;
+ }
+
+ virtual size_t GetMaxCiphertextByteSize(cricket::MediaType media_type,
+ size_t frame_size) override {
+ return frame_size;
+ }
+};
+
+class FrameDecryptorImpl : public webrtc::FrameDecryptorInterface {
+public:
+ FrameDecryptorImpl() {
+ }
+
+ virtual webrtc::FrameDecryptorInterface::Result Decrypt(cricket::MediaType media_type,
+ const std::vector<uint32_t>& csrcs,
+ rtc::ArrayView<const uint8_t> additional_data,
+ rtc::ArrayView<const uint8_t> encrypted_frame,
+ rtc::ArrayView<uint8_t> frame) override {
+ memcpy(frame.data(), encrypted_frame.data(), encrypted_frame.size());
+ for (auto it = frame.begin(); it != frame.end(); it++) {
+ *it ^= 123;
+ }
+ return webrtc::FrameDecryptorInterface::Result(webrtc::FrameDecryptorInterface::Status::kOk, encrypted_frame.size());
+ }
+
+ virtual size_t GetMaxPlaintextByteSize(cricket::MediaType media_type,
+ size_t encrypted_frame_size) override {
+ return encrypted_frame_size;
+ }
+};
+
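+// Forwards the PeerConnection events the group call cares about (ICE candidates,
+// connection state, added/removed tracks, missing ssrcs) to std::function handlers.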
+class PeerConnectionObserverImpl : public webrtc::PeerConnectionObserver {
+private:
+ std::function<void(std::string, int, std::string)> _discoveredIceCandidate;
+ std::function<void(bool)> _connectionStateChanged;
+ std::function<void(rtc::scoped_refptr<webrtc::RtpTransceiverInterface>)> _onTrackAdded;
+ std::function<void(rtc::scoped_refptr<webrtc::RtpReceiverInterface>)> _onTrackRemoved;
+ std::function<void(uint32_t)> _onMissingSsrc;
+
+public:
+ PeerConnectionObserverImpl(
+ std::function<void(std::string, int, std::string)> discoveredIceCandidate,
+ std::function<void(bool)> connectionStateChanged,
+ std::function<void(rtc::scoped_refptr<webrtc::RtpTransceiverInterface>)> onTrackAdded,
+ std::function<void(rtc::scoped_refptr<webrtc::RtpReceiverInterface>)> onTrackRemoved,
+ std::function<void(uint32_t)> onMissingSsrc
+ ) :
+ _discoveredIceCandidate(discoveredIceCandidate),
+ _connectionStateChanged(connectionStateChanged),
+ _onTrackAdded(onTrackAdded),
+ _onTrackRemoved(onTrackRemoved),
+ _onMissingSsrc(onMissingSsrc) {
+ }
+
+ virtual void OnSignalingChange(webrtc::PeerConnectionInterface::SignalingState new_state) override {
+ }
+
+ virtual void OnAddStream(rtc::scoped_refptr<webrtc::MediaStreamInterface> stream) override {
+ }
+
+ virtual void OnRemoveStream(rtc::scoped_refptr<webrtc::MediaStreamInterface> stream) override {
+ }
+
+ virtual void OnDataChannel(rtc::scoped_refptr<webrtc::DataChannelInterface> data_channel) override {
+ }
+
+ virtual void OnRenegotiationNeeded() override {
+ }
+
+ virtual void OnIceConnectionChange(webrtc::PeerConnectionInterface::IceConnectionState new_state) override {
+ bool isConnected = false;
+ switch (new_state) {
+ case webrtc::PeerConnectionInterface::IceConnectionState::kIceConnectionConnected:
+ case webrtc::PeerConnectionInterface::IceConnectionState::kIceConnectionCompleted:
+ isConnected = true;
+ break;
+ default:
+ break;
+ }
+ _connectionStateChanged(isConnected);
+ }
+
+ virtual void OnStandardizedIceConnectionChange(webrtc::PeerConnectionInterface::IceConnectionState new_state) override {
+ }
+
+ virtual void OnConnectionChange(webrtc::PeerConnectionInterface::PeerConnectionState new_state) override {
+ }
+
+ virtual void OnIceGatheringChange(webrtc::PeerConnectionInterface::IceGatheringState new_state) override {
+ }
+
+ virtual void OnIceCandidate(const webrtc::IceCandidateInterface* candidate) override {
+ std::string sdp;
+ candidate->ToString(&sdp);
+ _discoveredIceCandidate(sdp, candidate->sdp_mline_index(), candidate->sdp_mid());
+ }
+
+ virtual void OnIceCandidateError(const std::string& host_candidate, const std::string& url, int error_code, const std::string& error_text) override {
+ }
+
+ virtual void OnIceCandidateError(const std::string& address,
+ int port,
+ const std::string& url,
+ int error_code,
+ const std::string& error_text) override {
+ }
+
+ virtual void OnIceCandidatesRemoved(const std::vector<cricket::Candidate>& candidates) override {
+ }
+
+ virtual void OnIceConnectionReceivingChange(bool receiving) override {
+ }
+
+ virtual void OnIceSelectedCandidatePairChanged(const cricket::CandidatePairChangeEvent& event) override {
+ }
+
+ virtual void OnAddTrack(rtc::scoped_refptr<webrtc::RtpReceiverInterface> receiver, const std::vector<rtc::scoped_refptr<webrtc::MediaStreamInterface>>& streams) override {
+ }
+
+ virtual void OnTrack(rtc::scoped_refptr<webrtc::RtpTransceiverInterface> transceiver) override {
+ /*if (transceiver->receiver()) {
+ rtc::scoped_refptr decryptor(new rtc::RefCountedObject