From a19e3705a228735eacadd713b28d49fe33bb726b Mon Sep 17 00:00:00 2001 From: sijanec Date: Wed, 7 Apr 2021 23:15:16 +0200 Subject: UTF-8, image support, memory leaks fixed --- src/api.c | 180 ++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 117 insertions(+), 63 deletions(-) (limited to 'src/api.c') diff --git a/src/api.c b/src/api.c index 76431cc..9a9bbc8 100644 --- a/src/api.c +++ b/src/api.c @@ -48,9 +48,10 @@ char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char buf_length += readstatus; if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) { buf_sizeof *= SC_REALLOC_K; - buf = realloc(buf, sizeof(char)*buf_sizeof); + buf = realloc(buf, sizeof(char)*buf_sizeof); /* this IS safe, no matter how hard valgrind complains */ } } + buf[buf_length++] = '\0'; if (readstatus == -1) SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", endpoint_formatted ? endpoint_formatted : endpoint); xmlNanoHTTPClose(r); @@ -123,7 +124,7 @@ int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking } /* TODO: be pedantic and remove utm_source and other tracking bullshit */ return 1; } -struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */ +struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q, SC_OPT_TYPE opt) { /* check4cachedB4 */ /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */ /* if query is not NULL, it MUST be initialized */ /* @@ -147,7 +148,13 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s based http request-response based user interface so we can ask the user to complete the captcha. this is not yet implemeted and will be hard work. */ - int rs; + int rs = 1; + char * xpath = NULL; + char * descclass = NULL; + char * titleclass = NULL; + char * imageclass = NULL; + htmlDocPtr xmldoc = NULL; + char * txtdoc = NULL; if (!s || !c) { rs = -1; goto rc; @@ -159,10 +166,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s qwasgiven++; char * us = malloc(sizeof(char)*strlen(s)*3+1); urlencode(us, s); - char * xpath = NULL; - char * descclass = NULL; - char * titleclass = NULL; - char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100", us); + txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100&ie=UTF-8%s", us, (opt&SC_OPT_IMAGE) ? "&tbm=isch" : ""); // fprintf(stdout, "%s\n", txtdoc); free(us); if (!txtdoc) { @@ -170,91 +174,141 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s rs = -2; goto rc; } - titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}"); - descclass = sc_find_class(txtdoc, "{word-break:break-word}"); - if (!titleclass || !descclass) { - SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass"); - rs = -3; - goto rc; + if (opt & SC_OPT_IMAGE) { + imageclass = sc_find_class(txtdoc, "{font-family:Roboto,Helvetica,Arial,sans-serif}"); + if (!imageclass) { + SC_LOG(SC_LOG_ERROR, c, "!imageclass, txtdoc = %s", txtdoc); + rs = -3; + goto rc; + } + } else { + titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}"); + descclass = sc_find_class(txtdoc, "{word-break:break-word}"); + if (!titleclass || !descclass) { + SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass"); + rs = -4; + goto rc; + } } #define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */ #define SC_GTXD /* description */ "../..//table//span[@class='%s']" #define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']" +#define SC_GTXI "//div[@class='%s']//a" #define SC_GTR q->results[q->results_length-1] - xpath = malloc(strlen(titleclass)+strlen(SC_GTXF)); - sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */ - htmlDocPtr xmldoc = parseHtmlDocument(txtdoc, NULL); + xpath = malloc(strlen((opt & SC_OPT_IMAGE) ? imageclass : titleclass)+strlen((opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF)); + sprintf(xpath, (opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF, (opt & SC_OPT_IMAGE) ? imageclass : titleclass); + xmldoc = parseHtmlDocument(txtdoc, NULL); if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */ SC_CWLE(c, c->queries_lock); q->results_length = 0; gnu_code_start; - eachNodeX(xmldoc, xpath, - lambda(void, (xmlNodePtr node, void * data), - { - if (node->type == XML_ELEMENT_NODE) { - xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href"); - if (href) { - char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* fuck rules, I will rewrite it anyways <= hi future me */ - sc_fix_url(&hreflink); - char * x = malloc(strlen(descclass)+strlen(SC_GTXD)); - char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB)); - sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */); - sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */); - xmlNodePtr descnode = nthNodeXN(node, x, 0); - if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */ - descnode = nthNodeXN(node, "../../div/div", 0); - xmlNodePtr breadnode = nthNodeXN(node, xbread, 0); - free(x); - free(xbread); - if (q->results_sizeof <= q->results_length) - SC_BIGGER_ARRAY(q->results, sc_result); - q->results_length++; - SC_GTR->query = q; - SC_GTR->title = (char *) xmlNodeGetContent(node->children); - if (!SC_GTR->title) { - SC_GTR->title = malloc(strlen(SC_I18N_NO_TITLE)+1); - strcpy(SC_GTR->title, SC_I18N_NO_TITLE); - } - SC_GTR->url = hreflink; - if (!SC_GTR->url) { - SC_GTR->url = malloc(strlen(SC_I18N_NO_HREFLINK)+1); - strcpy(SC_GTR->url, SC_I18N_NO_HREFLINK); - } - SC_GTR->desc = (char *) xmlNodeGetContent(descnode); - if (!SC_GTR->desc) { - SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1); - strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION); - } - SC_GTR->breadcrumbs = (char *) xmlNodeGetContent(breadnode); - if (!SC_GTR->breadcrumbs) { - SC_GTR->breadcrumbs = malloc(strlen(SC_GTR->url)+1); - strcpy(SC_GTR->breadcrumbs, SC_GTR->url); - } - } + void sc_query_google_eachnode (xmlNodePtr node, void * data) { + if (node->type == XML_ELEMENT_NODE) { + xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href"); + if (href) { + char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* xmlGetProp copies and allocates */ + if (!hreflink) { + SC_LOG(SC_LOG_ERROR, c, "!hreflink"); + rs = -5; + return; + } + if (opt & SC_OPT_IMAGE) { + char * imgurl = NULL; /* do not free those when allocated by sscanf, as they will directly go into the struct. */ + char * imgrefurl = NULL; /* easy, huh? */ + SC_LOG(SC_LOG_DEBUG, c, "hreflink = %s", hreflink); + sscanf(hreflink, "/imgres?imgurl=%m[^&]&imgrefurl=%m[^&]", &imgurl, &imgrefurl); + if (!imgurl && !imgrefurl) { + SC_LOG(SC_LOG_ERROR, c, "!imgurl && !imgrefurl"); + /* rs = -6; */ /* we continue running not fail because of a single picture */ + free(imgurl); + free(imgrefurl); + return; /* check! */ + } + urldecode(imgurl, imgurl); + urldecode(imgrefurl, imgrefurl); + if (q->results_sizeof <= q->results_length) + SC_BIGGER_ARRAY(q->results, sc_result, 1); + q->results_length++; + SC_GTR->query = q; + SC_GTR->title = NULL; /* can't get title from here, would have to load /imgres, which is bloat */ + SC_GTR->url = imgrefurl; + SC_GTR->desc = imgurl; + SC_GTR->breadcrumbs = NULL; + } else { + char * orig_hreflink_for_free = hreflink; + sc_fix_url(&hreflink); + char * x = malloc(strlen(descclass)+strlen(SC_GTXD)); + char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB)); + sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */); + sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */); + xmlNodePtr descnode = nthNodeXN(node, x, 0); + if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */ + descnode = nthNodeXN(node, "../../div/div", 0); + xmlNodePtr breadnode = nthNodeXN(node, xbread, 0); + free(x); + free(xbread); + if (q->results_sizeof <= q->results_length) + SC_BIGGER_ARRAY(q->results, sc_result, 1); + q->results_length++; + SC_GTR->query = q; + char * cp = (char *) xmlNodeGetContent(node->children); + if (cp) { + SC_GTR->title = malloc(strlen(cp)+1); + strcpy(SC_GTR->title, cp); + xmlFree(cp); + } else SC_GTR->title = NULL; + if (hreflink) { + SC_GTR->url = malloc(strlen(hreflink)+1); + strcpy(SC_GTR->url, hreflink); + xmlFree(orig_hreflink_for_free); + } else SC_GTR->url = NULL; + cp = (char *) xmlNodeGetContent(descnode); + if (cp) { + SC_GTR->desc = malloc(strlen(cp)+1); + strcpy(SC_GTR->desc, cp); + xmlFree(cp); + } else SC_GTR->desc = NULL; + cp = (char *) xmlNodeGetContent(breadnode); + if (cp) { + SC_GTR->breadcrumbs = malloc(strlen(cp)+1); + strcpy(SC_GTR->breadcrumbs, cp); + xmlFree(cp); } } - ), - NULL); + } + } + } + eachNodeX(xmldoc, xpath, sc_query_google_eachnode, NULL); gnu_code_end; + if (rs < 0) { + SC_LOG(SC_LOG_ERROR, c, "rs < 0 (rs == %d)", rs); + if (qwasgiven) + SC_CUE(c, c->queries_lock); + goto rc; + } q->cache = c; q->lookup_time = time(NULL); q->engines = SC_ENGINE_GOOGLE; q->string = realloc(q->string, strlen(s)+1); + q->opt = opt; strcpy(q->string, s); if (!qwasgiven) { SC_CWLE(c, c->queries_lock); if (c->queries_sizeof <= c->queries_length) - SC_BIGGER_ARRAY(c->queries, sc_query); + SC_BIGGER_ARRAY(c->queries, sc_query, 0); c->queries_length++; #define SC_GTQ c->queries[c->queries_length-1] SC_GTQ = q; } SC_CUE(c, c->queries_lock); - xmlFreeDoc(xmldoc); rc: + if (!qwasgiven && rs < 0) + sc_query_free(q); + xmlFreeDoc(xmldoc); free(txtdoc); free(titleclass); free(descclass); + free(imageclass); free(xpath); return (rs < 0) ? NULL : q; } -- cgit v1.2.3