Ad
Code
Diff
  • #include <stdlib.h>
    #include <string.h>
    
    /*
     * Builds a freshly allocated copy of `word` with its characters in
     * reverse order. Returns NULL when `word` is NULL or when the allocation
     * fails. The returned buffer comes from malloc(), so the caller releases
     * it with free().
     */
    char *reverse_string(const char *word) {
        if (word == NULL) {
            return NULL;
        }

        const size_t length = strlen(word);
        char *reversed = malloc(length + 1);
        if (reversed == NULL) {
            return NULL;
        }

        /* Read the source back-to-front while writing the copy front-to-back. */
        const char *src = word + length;
        char *dst = reversed;
        while (src != word) {
            *dst++ = *--src;
        }
        *dst = '\0';

        return reversed;
    }
    
    • #include <stdlib.h>
    • #include <string.h>
    • char *reverse_string(const char *word)
    • {
    • size_t len = strlen(word);
    • char *res = malloc(len + 1);
    • for (size_t i = 0; i < len; i++)
    • res[i] = word[len - i - 1];
    • return res[len] = '\0', res;
    • }
    • char *reverse_string(const char *word) {
    • if (!word) return NULL;
    • size_t len = strlen(word);
    • char *res = malloc(len + 1);
    • if (!res) return NULL;
    • for (size_t i = 0; i < len; i++)
    • res[i] = word[len - i - 1];
    • res[len] = '\0';
    • return res;
    • }

The Ultimate String Uppercase Conversion Challenge

Description:

This is one of the most difficult string uppercase optimization problems. The goal is to implement an ultra-efficient, CPU-optimized, vectorized, and parallelized version of an uppercase conversion function.

The Challenge:

  • Use SIMD (Single Instruction, Multiple Data) optimizations to process multiple characters in parallel.
  • Ensure zero unnecessary memory allocations and use alignment-aware memory handling for cache efficiency.
  • Eliminate unnecessary branching by leveraging branch-prediction-aware looping strategies to minimize CPU stalls.
  • Handle large strings at extreme speeds, scaling across modern CPU architectures (x86, ARM, RISC-V).
  • Avoid memory leaks, undefined behavior, and unnecessary performance bottlenecks.

This problem requires mastery of low-level optimizations, CPU pipeline efficiency, and SIMD vectorization.


Test Cases Include:

  • Basic cases: Convert small strings to uppercase.
  • Edge cases: Handle empty strings and NULL inputs safely.
  • Performance cases: Process millions of characters efficiently.

Challenge for Code Warriors:

  • Can you optimize further using AVX-512 or custom assembly for even greater speed?
  • Can you integrate multi-threading to utilize multi-core parallelism?
  • Can you write a self-adapting function that selects the best SIMD instruction set at runtime?

This challenge is designed for those who want to push the limits of performance engineering. If you can improve this, you are among the most skilled low-level performance engineers.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <immintrin.h>

/**
 * Returns a newly allocated copy of `input` in which the ASCII letters
 * 'a'..'z' are replaced by 'A'..'Z'; every other byte is copied unchanged.
 *
 * @param input NUL-terminated string to convert; may be NULL.
 * @return A 32-byte-aligned buffer obtained from aligned_alloc() that the
 *         caller releases with free(), or NULL when `input` is NULL or the
 *         allocation fails.
 */
char *to_uppercase(const char *input) {
    if (!input) {
        return NULL;
    }

    const size_t alignment = 32;
    const size_t lanes = 16; /* bytes handled per 128-bit vector */
    const size_t len = strlen(input);

    /*
     * C11 requires the size passed to aligned_alloc() to be a multiple of the
     * alignment, so round len + 1 up to the next multiple; refuse lengths for
     * which that rounding would wrap around.
     */
    if (len > (size_t)-1 - alignment) {
        return NULL;
    }
    const size_t capacity = (len / alignment + 1) * alignment;

    char *result = aligned_alloc(alignment, capacity);
    if (!result) {
        return NULL;
    }

    const __m128i below_a = _mm_set1_epi8('a' - 1);
    const __m128i above_z = _mm_set1_epi8('z' + 1);
    const __m128i case_delta = _mm_set1_epi8('a' - 'A');

    size_t i = 0;
    for (; i + lanes <= len; i += lanes) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)(input + i));

        /*
         * The comparisons are signed, so bytes >= 0x80 compare as negative
         * and never land inside the 'a'..'z' window.
         */
        const __m128i is_lower = _mm_and_si128(_mm_cmpgt_epi8(chunk, below_a),
                                               _mm_cmplt_epi8(chunk, above_z));

        /* Subtract 'a' - 'A' only in the lanes that hold a lowercase letter. */
        const __m128i upper =
            _mm_sub_epi8(chunk, _mm_and_si128(is_lower, case_delta));

        _mm_storeu_si128((__m128i *)(result + i), upper);
    }

    /* Scalar loop for the final (len % 16) bytes. */
    for (; i < len; i++) {
        const char c = input[i];
        result[i] = (c >= 'a' && c <= 'z') ? (char)(c - ('a' - 'A')) : c;
    }

    result[len] = '\0';
    return result;
}