Using UTF-8 Datastreams in cURL, when is the datastream converted from UTF-8 to ASCII?-CodePudding

When using cURL to take character streams off of the internet, when is the datastream converted from a multibyte datatype to a single byte character array?

I wrote a program here, which appears to work using ASCII in the callback function.

However, I wrote another program that uses UTF-8 with wchar_t datatypes, which also appears to work. The datastream does not appear to differentiate between the two datatypes, even though a wchar_t type is 4 bytes on my machine and a char is 1 byte.

I guess that some sort of type conversion is going on transparently to this program, but I am not aware of it. (I think that in UTF-8, ASCII characters still take 1 byte of memory, but when a program uses wchar_t datatypes, the system pads regular ASCII characters with zeros, converting them to 4 bytes; however, this was not something the programmer implemented...)

#include "multicurl.h"

/* Upper bound, in milliseconds, for one curl_multi_wait() call (5 seconds).
   Parenthesized so the macro expands safely inside larger expressions. */
#define MAX_WAIT_MSECS (5 * 1000)

/*
 * libcurl write callback: append one chunk of the response body to the
 * MemType buffer that was registered with CURLOPT_WRITEDATA.
 *
 * ptr      - the received chunk; it is NOT NUL-terminated.
 * size     - size of one data item.
 * nmemb    - number of items; the chunk is size * nmemb bytes long.
 * userdata - the MemType that accumulates the body.
 *
 * Returns the number of bytes consumed (size * nmemb). If the buffer cannot
 * be grown, an error is printed and the process exits.
 *
 * mem->size counts the BYTES stored so far, excluding the terminator. The
 * chunk is appended at that byte offset and followed by sizeof(wchar_t) zero
 * bytes, so the buffer is terminated whether it is later read as char or as
 * wchar_t. Byte offsets are used instead of size / sizeof(wchar_t) element
 * indexes because a chunk length need not be a multiple of sizeof(wchar_t);
 * the truncating division would overwrite the tail of the previous chunk.
 *
 * NOTE(review): libcurl documents this callback as taking `char *ptr` and
 * hands over the bytes exactly as received (e.g. UTF-8); it never converts
 * them to wchar_t. The wchar_t parameter type is kept only so the signature
 * stays as it was; the data is treated as raw bytes here.
 */
static size_t write_callback(wchar_t *ptr, size_t size, size_t nmemb, void *userdata){
    size_t realsize = nmemb * size;
    MemType *mem = (MemType *)userdata;

    /* Room for the bytes already stored, the new chunk and one terminator. */
    wchar_t *tmp = realloc(mem->memory, mem->size + realsize + sizeof(wchar_t));
    if (tmp == NULL){
        fprintf(stderr, "Not Enough Memory, realloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    mem->memory = tmp;

    char *bytes = (char *)mem->memory;
    memcpy(bytes + mem->size, ptr, realsize);       /* overwrites the old terminator */
    mem->size += realsize;
    memset(bytes + mem->size, 0, sizeof(wchar_t));  /* new terminator, replaced by the next chunk */

    return realsize;  /* any other value makes libcurl abort the transfer */
}

/*
 * Create an easy handle for one URL, direct its response body into `output`
 * and attach it to the multi handle `mh`.
 *
 * mh       - multi handle the new easy handle is added to.
 * utf8_url - request URL as a wide string; it is converted to a multibyte
 *            string with wcstombs(), i.e. according to the current locale.
 * output   - receives the body through write_callback; its buffer is
 *            initialised here to an empty, terminated string.
 *
 * Always returns NULL; the result is delivered through `output`. On any
 * failure a message is printed to stderr and no handle is added to `mh`.
 */
void *SetUpCurlHandle(CURLM * mh, wchar_t *utf8_url, MemType *output){
    /* Initialise the output first so it is well defined on every exit path. */
    output->size = 0;
    output->memory = malloc(sizeof(wchar_t));
    if (output->memory == NULL){
        fprintf(stderr, "error: out of memory for the output buffer\n");
        return NULL;
    }
    output->memory[0] = 0;

    /* One wide character converts to at most MB_CUR_MAX bytes, so this
       capacity always has room for the converted URL plus its terminator. */
    size_t url_cap = wcslen(utf8_url) * MB_CUR_MAX + 1;
    char *url = malloc(url_cap);
    if (url == NULL){
        fprintf(stderr, "error: out of memory for the URL\n");
        return NULL;
    }
    if (wcstombs(url, utf8_url, url_cap) == (size_t)-1){
        fprintf(stderr, "error: the URL cannot be converted in the current locale\n");
        free(url);
        return NULL;
    }

    CURL *hnd = curl_easy_init();
    if (hnd == NULL){
        fprintf(stderr, "error: curl_easy_init() failed\n");
        free(url);
        return NULL;
    }

    // Setup the cURL options.
    curl_easy_setopt(hnd, CURLOPT_BUFFERSIZE, 102400L);
    curl_easy_setopt(hnd, CURLOPT_URL, url);// Set the request URL
    curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
    curl_easy_setopt(hnd, CURLOPT_USERAGENT, "curl/7.80.0");
    curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
    curl_easy_setopt(hnd, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
    curl_easy_setopt(hnd, CURLOPT_FTP_SKIP_PASV_IP, 1L);
    curl_easy_setopt(hnd, CURLOPT_TCP_KEEPALIVE, 1L);
    curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, write_callback);// The callback function to write data to.
    curl_easy_setopt(hnd, CURLOPT_WRITEDATA, (void *)output);// Send the address of the data struct to callback func.

    /* libcurl keeps its own copy of the CURLOPT_URL string, so ours can be released. */
    free(url);

    CURLMcode rc = curl_multi_add_handle(mh, hnd);
    if (rc != CURLM_OK){
        fprintf(stderr, "error: curl_multi_add_handle() returned %d\n", (int)rc);
        curl_easy_cleanup(hnd);  /* not attached to mh, so nobody else would free it */
    }
    return NULL;// The output struct was passed by reference no need to return anything.
}

CURLM *SetUpMultiCurlHandle(){
    curl_global_init(CURL_GLOBAL_ALL);

    CURLM * mh = curl_multi_init();
    return mh;
}

void *PerformMultiCurl(CURLM * mh)
/*
 * Run every transfer attached to `mh` to completion, then release the
 * finished easy handles, the multi handle and libcurl's global state.
 *
 * The transfers progress concurrently, but they are driven from the calling
 * thread by the perform/wait loop below; this function only returns once no
 * transfer is still running (or an error stops the loop).
 *
 * Always returns NULL. Failed transfers are reported on stderr.
 *
 * NOTE(review): if curl_multi_wait() fails the function returns at once
 * WITHOUT cleaning anything up, as it did before; `mh` and its easy handles
 * are then still allocated.
 */
{
    int still_running = 0;

    CURLMcode mrc = curl_multi_perform(mh, &still_running);// Start the requests.
    while (mrc == CURLM_OK && still_running) {
        int numfds = 0;
        /* Sleep until there is activity on a transfer or the timeout expires. */
        mrc = curl_multi_wait(mh, NULL, 0, MAX_WAIT_MSECS, &numfds);
        if (mrc != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_wait() returned %d\n", (int)mrc);
            return NULL;
        }
        mrc = curl_multi_perform(mh, &still_running);
    }
    if (mrc != CURLM_OK) {
        /* Stop driving the transfers instead of looping on a failing handle. */
        fprintf(stderr, "error: curl_multi_perform() returned %d\n", (int)mrc);
    }

    /* Collect the result of every finished transfer and free its handle. */
    CURLMsg *msg = NULL;
    int msgs_left = 0;
    while ((msg = curl_multi_info_read(mh, &msgs_left))) {
        if (msg->msg != CURLMSG_DONE) {
            fprintf(stderr, "error: after curl_multi_info_read(), CURLMsg=%d\n", msg->msg);
            continue;
        }

        /* Copy what is needed first: the message storage belongs to the
           multi handle and does not survive curl_multi_remove_handle(). */
        CURL *hnd = msg->easy_handle;
        CURLcode return_code = msg->data.result;

        if (return_code != CURLE_OK) {
            fprintf(stderr, "CURL error code: %d\n", (int)return_code);
        }

        /* Release the handle whether the transfer succeeded or failed. */
        curl_multi_remove_handle(mh, hnd);
        curl_easy_cleanup(hnd);
    }

    curl_multi_cleanup(mh);
    curl_global_cleanup();
    return NULL;
}

The full UTF-8 variation of this program can be found here

CodePudding user response：

As you would expect, it doesn't work. libcurl has no way to know the function expects a wchar_t* when it should expect a char*

If you inspect MyOutputStruct1.memory[0], you'll find it doesn't contain what it should. For example, when requesting https://stackoverflow.com, it contains 0x4f44213c. This is obviously wrong since that's far outside the range of valid Code Points. This is actually the first four Code Points (<!DO) jammed into one wchar_t (in LE order).

It kind of appears to work because of a second bug. When printing a wide string, you need to use %ls, not %s.

wprintf(L"Output:\n%s\n", MyOutputStruct1.memory);

should be

printf("Output:\n%ls\n", MyOutputStruct1.memory);
// -or-
wprintf(L"Output:\n%ls\n", MyOutputStruct1.memory);

Basically, the code expects a char* throughout. The pointer's type is wchar_t*, but it's used as a char* everywhere. As such, the two bugs mostly "cancel out" in the program in question. (I didn't look, but I expect a problem with inputs whose length isn't divisible by sizeof(wchar_t).) If the pointer had actually been used as a wchar_t* (e.g. if its elements had been inspected or if it had been passed to a w function), the problem would have been obvious.

CodePudding user response：

As was stated in the comment section all this really needed was a UTF-8 parser. Chars can hold UTF-8 but we cannot easily address each character individually without converting them to some other datatype [some UTF-8 characters are larger than 1 byte]. So I wrote a parser with the help of libutf-8.

/* gcc unicode.c -o unicode -lutf-8 
This program makes use of libutf-8.
http://www.whizkidtech.redprince.net/i18n/
*/

#include <stdio.h>
#include <stdlib.h>
#include<string.h>

#include <locale.h>
#include <utf-8.h>

int* parse_UTF8_bitstream(int *len, const char* input_stream)
/*
 * Decode a NUL-terminated UTF-8 byte string into a newly allocated array
 * holding one element per character, so characters can be addressed
 * individually.
 *
 * len          - out: number of characters stored, not counting the
 *                terminating zero element.
 * input_stream - NUL-terminated UTF-8 input.
 *
 * Returns the zero-terminated array, which the caller must free(), or NULL
 * (with *len set to 0) when memory cannot be allocated.
 */
{
    size_t nbytes = strlen(input_stream);
    *len = 0;

    /* Every character consumes at least one input byte, so nbytes elements
       plus one for the terminator is always enough; no per-character
       realloc is needed. */
    unsigned int *output = malloc((nbytes + 1) * sizeof *output);
    if (output == NULL) {
        return NULL;
    }

    size_t i = 0;
    while (i < nbytes) {
        int n = 0;
        output[*len] = sgetu8((unsigned char *) &input_stream[i], &n);
        /* NOTE(review): assumes sgetu8() stores the number of bytes it
           consumed in n - confirm against libutf-8. A non-positive value
           would never advance, so decoding stops there. */
        if (n <= 0) {
            break;
        }
        i += (size_t)n;
        *len = *len + 1;
    }

    output[*len] = 0;  /* terminator: callers print the result with %ls */
    return (int*)output;
}


static void print_decoded(const char *utf8)
/* Decode one UTF-8 string, then print it whole, its character count and
   each character on a line of its own. Exits if decoding runs out of memory. */
{
  int len = 0;
  int *wide = parse_UTF8_bitstream(&len, utf8);
  if (wide == NULL) {
    fprintf(stderr, "Not enough memory to decode the string.\n");
    exit(EXIT_FAILURE);
  }

  /* NOTE(review): %ls expects a wchar_t string; passing the decoded int
     array relies on wchar_t having the same size and representation as int
     on this platform - confirm before porting. */
  printf("%ls\n", (wchar_t *)wide);
  printf("LENGTH: %d Wide Characters\n", len);
  for (int i = 0; i < len; i++) {
    printf("%lc\n", wide[i]);
  }

  free(wide);
}

int main(void)
/* Print two UTF-8 sample strings with their byte lengths, then the decoded
   per-character view of each. */
{
  /* Use the environment's locale so the wide-character output can be
     converted back to the terminal's encoding. */
  setlocale(LC_ALL, "");

  const char *string ="ايه الاخبار";
  printf("%s\n",string);
  printf("LENGTH: %zu 1-Byte Characters\n\n", strlen(string));

  const char *string2 = "при";
  printf("%s\n",string2);
  printf("LENGTH: %zu  1-Byte Characters\n", strlen(string2));

  print_decoded(string);
  print_decoded(string2);

  return 0;
}