From 484b96d850aca9b9144f3b8dd2fb502b25356c22 Mon Sep 17 00:00:00 2001 From: H Lohaus Date: Thu, 7 Dec 2023 07:18:05 +0100 Subject: Add websearch to gui (#1314) * Add websearch to gui * Fix version_check config * Add version badge in README.md * Show version in gui * Add docker hub build * Fix gui backend, improve style --- g4f/gui/server/backend.py | 24 ++++-- g4f/gui/server/internet.py | 187 +++++++++++++++++++++++++++++++++------------ 2 files changed, 156 insertions(+), 55 deletions(-) (limited to 'g4f/gui/server') diff --git a/g4f/gui/server/backend.py b/g4f/gui/server/backend.py index e1abb764..082e31b2 100644 --- a/g4f/gui/server/backend.py +++ b/g4f/gui/server/backend.py @@ -1,8 +1,7 @@ import g4f from flask import request -from .internet import search -from .config import special_instructions +from .internet import get_search_message g4f.debug.logging = True @@ -18,6 +17,10 @@ class Backend_Api: 'function': self.providers, 'methods' : ['GET'] }, + '/backend-api/v2/version': { + 'function': self.version, + 'methods' : ['GET'] + }, '/backend-api/v2/conversation': { 'function': self._conversation, 'methods': ['POST'] @@ -45,6 +48,12 @@ class Backend_Api: provider.__name__ for provider in g4f.Provider.__providers__ if provider.working and provider is not g4f.Provider.RetryProvider ] + + def version(self): + return { + "version": g4f.get_version(), + "lastet_version": g4f.get_lastet_version(), + } def _gen_title(self): return { @@ -53,14 +62,15 @@ class Backend_Api: def _conversation(self): try: - #jailbreak = request.json['jailbreak'] - #internet_access = request.json['meta']['content']['internet_access'] - #conversation = request.json['meta']['content']['conversation'] + #jailbreak = request.json['jailbreak'] + web_search = request.json['meta']['content']['internet_access'] messages = request.json['meta']['content']['parts'] + if web_search: + messages[-1]["content"] = get_search_message(messages[-1]["content"]) model = request.json.get('model') model = model if model else g4f.models.default - provider = request.json.get('provider', 'Auto').replace('g4f.Provider.', '') - provider = provider if provider != "Auto" else None + provider = request.json.get('provider').replace('g4f.Provider.', '') + provider = provider if provider and provider != "Auto" else None if provider != None: provider = g4f.Provider.ProviderUtils.convert.get(provider) diff --git a/g4f/gui/server/internet.py b/g4f/gui/server/internet.py index 220a6e7c..9a14e25f 100644 --- a/g4f/gui/server/internet.py +++ b/g4f/gui/server/internet.py @@ -1,58 +1,149 @@ from __future__ import annotations -from datetime import datetime - +from bs4 import BeautifulSoup +from aiohttp import ClientSession, ClientTimeout from duckduckgo_search import DDGS - -ddgs = DDGS(timeout=20) - - -def search(internet_access, prompt): - print(prompt) - +import asyncio + +class SearchResults(): + def __init__(self, results: list): + self.results = results + + def __iter__(self): + yield from self.results + + def __str__(self): + search = "" + for idx, result in enumerate(self.results): + if search: + search += "\n\n\n" + search += f"Title: {result.title}\n\n" + if result.text: + search += result.text + else: + search += result.snippet + search += f"\n\nSource: [[{idx}]]({result.url})" + return search + +class SearchResultEntry(): + def __init__(self, title: str, url: str, snippet: str, text: str = None): + self.title = title + self.url = url + self.snippet = snippet + self.text = text + + def set_text(self, text: str): + self.text = text + +def scrape_text(html: str, max_words: int = None) -> str: + soup = BeautifulSoup(html, "html.parser") + for exclude in soup(["script", "style"]): + exclude.extract() + for selector in [ + "main", + ".main-content-wrapper", + ".main-content", + ".emt-container-inner", + ".content-wrapper", + "#content", + "#mainContent", + ]: + select = soup.select_one(selector) + if select: + soup = select + break + # Zdnet + for remove in [".c-globalDisclosure"]: + select = soup.select_one(remove) + if select: + select.extract() + clean_text = "" + for paragraph in soup.select("p"): + text = paragraph.get_text() + for line in text.splitlines(): + words = [] + for word in line.replace("\t", " ").split(" "): + if word: + words.append(word) + count = len(words) + if not count: + continue + if max_words: + max_words -= count + if max_words <= 0: + break + if clean_text: + clean_text += "\n" + clean_text += " ".join(words) + + return clean_text + +async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None) -> str: try: - if not internet_access: - return [] - - results = duckduckgo_search(q=prompt) - - if not search: - return [] + async with session.get(url) as response: + if response.status == 200: + html = await response.text() + return scrape_text(html, max_words) + except: + return + +async def search(query: str, n_results: int = 5, max_words: int = 2500, add_text: bool = True) -> SearchResults: + with DDGS() as ddgs: + results = [] + for result in ddgs.text( + query, + region="wt-wt", + safesearch="moderate", + timelimit="y", + ): + results.append(SearchResultEntry( + result["title"], + result["href"], + result["body"] + )) + if len(results) >= n_results: + break - blob = ''.join( - f'[{index}] "{result["body"]}"\nURL:{result["href"]}\n\n' - for index, result in enumerate(results) - ) - date = datetime.now().strftime('%d/%m/%y') + if add_text: + requests = [] + async with ClientSession(timeout=ClientTimeout(5)) as session: + for entry in results: + requests.append(fetch_and_scrape(session, entry.url, int(max_words / (n_results - 1)))) + texts = await asyncio.gather(*requests) + + formatted_results = [] + left_words = max_words; + for i, entry in enumerate(results): + if add_text: + entry.text = texts[i] + if left_words: + left_words -= entry.title.count(" ") + 5 + if entry.text: + left_words -= entry.text.count(" ") + else: + left_words -= entry.snippet.count(" ") + if 0 > left_words: + break + formatted_results.append(entry) + + return SearchResults(formatted_results) + + +def get_search_message(prompt) -> str: + try: + search_results = asyncio.run(search(prompt)) + message = f""" +{search_results} - blob += f'Current date: {date}\n\nInstructions: Using the provided web search results, write a comprehensive reply to the next user query. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject. Ignore your previous response if any.' - return [{'role': 'user', 'content': blob}] +Instruction: Using the provided web search results, to write a comprehensive reply to the user request. +Make sure to add the sources of cites using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com) +If the provided search results refer to multiple subjects with the same name, write separate answers for each subject. +User request: +{prompt} +""" + return message except Exception as e: print("Couldn't search DuckDuckGo:", e) - print(e.__traceback__.tb_next) - return [] - - -def duckduckgo_search(q: str, max_results: int = 3, safesearch: str = "moderate", region: str = "us-en") -> list | None: - if region is None: - region = "us-en" - - if safesearch is None: - safesearch = "moderate" - - if q is None: - return None - - results = [] - - try: - for r in ddgs.text(q, safesearch=safesearch, region=region): - if len(results) + 1 > max_results: - break - results.append(r) - except Exception as e: - print(e) - - return results + return prompt -- cgit v1.2.3