Skip to content
This repository has been archived by the owner on Oct 14, 2023. It is now read-only.

Commit

Permalink
0.2.4 动态惩罚机器,但是暂时不可用
Browse files Browse the repository at this point in the history
  • Loading branch information
sudoskys committed Feb 3, 2023
1 parent 3e2e8d8 commit f09433c
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 67 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "llm_kira"
version = "0.2.3"
version = "0.2.4"
description = "chatbot client for llm"
authors = ["sudoskys <[email protected]>"]
maintainers = [
Expand Down
4 changes: 2 additions & 2 deletions src/llm_kira/client/Optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def forgetting_curve(x):
# 相似度检索
for i in range(0, len(memory)):
ask, reply = MsgFlow.get_content(memory[i], sign=False)
_ask_diff = Utils.cosion_sismilarity(pre=prompt, aft=ask)
_ask_diff = Utils.cosion_similarity(pre=prompt, aft=ask)
_ask_diff = _ask_diff * 100
score = _ask_diff if _ask_diff < 90 else 0
if score != 0:
Expand Down Expand Up @@ -256,7 +256,7 @@ def forgetting_curve(x):
# 相似度检索
for i in range(0, len(memory)):
ask, reply = MsgFlow.get_content(memory[i], sign=False)
_ask_diff = Utils.cosion_sismilarity(pre=prompt, aft=ask)
_ask_diff = Utils.cosion_similarity(pre=prompt, aft=ask)
_ask_diff = _ask_diff * 100
score = _ask_diff if _ask_diff < 90 else 0
if score != 0:
Expand Down
13 changes: 5 additions & 8 deletions src/llm_kira/client/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,19 +328,16 @@ async def predict(self,
foot=_prompt_foot,
token=_llm_result_limit)

# Clean
_prompt_body = [item for item in _prompt_body if item]

# Stick Them
_prompt = _prompt + f"\n{self.profile.restart_name}:"
if not prompt_iscode:
_prompt = _prompt.replace("\n\n", "\n")

# ODO
# logger.warning(_prompt)

# Get
llm_result = await self.llm.run(prompt=_prompt, predict_tokens=predict_tokens, llm_param=llm_param)
llm_result = await self.llm.run(prompt=_prompt,
validate=_prompt_body,
predict_tokens=predict_tokens,
llm_param=llm_param)
llm_result: LlmReturn

# Parse Result
Expand All @@ -350,4 +347,4 @@ async def predict(self,
return ChatBotReturn(conversation_id=f"{self.profile.conversation_id}",
llm=llm_result,
ask=prompt_text,
reply=self.llm.parse_reply(llm_result.reply))
reply=self.llm.parse_reply(llm_result.reply).rstrip("<im_end>"))
5 changes: 4 additions & 1 deletion src/llm_kira/client/llms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def get_token_limit(self) -> int:
return 2000

@abstractmethod
def tokenizer(self, text):
def tokenizer(self, text, raw=False) -> Union[int, list]:
if raw:
return []
return len(text)

@abstractmethod
Expand Down Expand Up @@ -80,6 +82,7 @@ def parse_usage(response) -> Optional[int]:
@abstractmethod
async def run(self,
prompt: str,
validate: Union[List[str], None] = None,
predict_tokens: int = 500,
llm_param: LlmBaseParam = None,
) -> Optional[LlmReturn]:
Expand Down
55 changes: 41 additions & 14 deletions src/llm_kira/client/llms/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,12 @@ def __init__(self, profile: Conversation,
def get_token_limit(self) -> int:
return self.token_limit

def tokenizer(self, text) -> int:
def tokenizer(self, text, raw: bool = False) -> Union[int, list]:
gpt_tokenizer = tiktoken.get_encoding("gpt2")
_token = len(gpt_tokenizer.encode(text))
return _token
_token = gpt_tokenizer.encode(text)
if raw:
return _token
return len(_token)

@staticmethod
def parse_response(response) -> list:
Expand Down Expand Up @@ -152,6 +154,7 @@ def resize_sentence(self, text: str, token: int) -> str:
return text

def resize_context(self, head: list, body: list, foot: list, token: int) -> str:
body = [item for item in body if item]
token = token if token > 5 else 5
_head = '\n'.join(head) + "\n"
_body = "\n".join(body) + "\n"
Expand Down Expand Up @@ -188,25 +191,29 @@ def model_context_size(model_name: str) -> int:

async def run(self,
prompt: str,
validate: Union[List[str], None] = None,
predict_tokens: int = 500,
llm_param: OpenAiParam = None
) -> LlmReturn:
"""
异步的,得到对话上下文
:param predict_tokens: 限制返回字符数量
:param validate: 惩罚验证列表
:param prompt: 提示词
:param llm_param: 参数表
:return:
"""

_request_arg = {
"top_p": 1,
"n": 1
}
_request_arg: dict

# Kwargs
if llm_param:
_request_arg.update(llm_param.invocation_params)

if validate is None:
validate = []
_request_arg.update(model=str(llm_param.model_name),
prompt=str(prompt),
max_tokens=int(predict_tokens),
Expand All @@ -217,26 +224,46 @@ async def run(self,
f"{self.profile.restart_name}:"],
)

# Penalty
if self.auto_penalty:
# THINK ABOUT HOT CAKE
_frequency_penalty, _presence_penalty, _temperature = Detect().gpt_tendency_arg(prompt=prompt)
# SOME HOT CAKE
# Adjust Penalty
if self.auto_penalty and validate:
# Cook
_frequency_penalty, _presence_penalty, _temperature = Detect().gpt_tendency_arg(prompt=prompt,
memory=validate,
tokenizer=self.tokenizer
)
# Some Update
_request_arg.update({
"frequency_penalty": float(_frequency_penalty),
"presence_penalty": float(_presence_penalty),
"temperature": float(_temperature),
})
# logit_bias

# 校准字节参数
if not _request_arg.get("logit_bias"):
_request_arg["logit_bias"] = {}
_request_arg.pop("logit_bias")
# Req

# 校准温度和惩罚参数
if _request_arg.get("frequency_penalty"):
_frequency_penalty = _request_arg["frequency_penalty"]
_frequency_penalty = _frequency_penalty if -2.0 < _frequency_penalty else -1.9
_frequency_penalty = _frequency_penalty if _frequency_penalty < 2.0 else 1.9
_request_arg["frequency_penalty"] = _frequency_penalty

if _request_arg.get("presence_penalty"):
_presence_penalty = _request_arg["presence_penalty"]
_presence_penalty = _presence_penalty if -2.0 < _presence_penalty else -1.9
_presence_penalty = _presence_penalty if _presence_penalty < 2.0 else 1.9
_request_arg["presence_penalty"] = _presence_penalty

if _request_arg.get("temperature"):
_temperature = _request_arg["temperature"]
_request_arg["temperature"] = _temperature if 0 < _temperature < 1 else 0.9

# 请求
response = await Completion(api_key=self.__api_key, call_func=self.__call_func).create(
**_request_arg
)

# Reply
reply = self.parse_response(response)
usage = self.parse_usage(response)
return LlmReturn(model_flag=llm_param.model_name,
Expand Down
4 changes: 2 additions & 2 deletions src/llm_kira/client/module/plugin/_plugin_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def nlp_filter_list(prompt, material: list):
continue
_pre = material[i]
_afe = material[i + 1]
sim = Utils.cosion_sismilarity(pre=_pre, aft=_afe)
sim = Utils.cosion_similarity(pre=_pre, aft=_afe)
if sim > 0.7:
_remo = _afe if len(_afe) > len(_pre) else _pre
# 移除过于相似的
Expand All @@ -129,7 +129,7 @@ def nlp_filter_list(prompt, material: list):
material = list(material_.keys())
_top_table = {}
for item in material:
_top_table[item] = Utils.cosion_sismilarity(pre=prompt, aft=item)
_top_table[item] = Utils.cosion_similarity(pre=prompt, aft=item)
material = {k: v for k, v in _top_table.items() if v > 0.15}
# 搜索引擎比相似度算法靠谱所以注释掉了
# material = OrderedDict(sorted(material.items(), key=lambda t: t[1]))
Expand Down
8 changes: 7 additions & 1 deletion src/llm_kira/client/module/plugin/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ class Week(object):
def __init__(self):
self._server = None
self._text = None
self._time = ["time", "多少天", "几天", "时间", "几点", "今天", "昨天", "明天", "几月", "几月", "几号",
self._time = ["time", "今年", "2022",
"2023", "2024", "year", "day",
"多少天", "几几年",
"几天", "时间",
"几点", "今天",
"昨天", "明天",
"几月", "几号",
"几个月",
"天前"]

Expand Down
2 changes: 1 addition & 1 deletion src/llm_kira/client/module/plugin/week.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self):
self._server = None
self._text = None
self._week_list = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
self._week_key = ["星期", "星期几", "时间", "周几", "周一", "周二", "周三", "周四", "周五", "周六"]
self._week_key = ["星期", "星期几", "week", "时间", "周几", "周一", "周二", "周三", "周四", "周五", "周六"]

def requirements(self):
return []
Expand Down
2 changes: 2 additions & 0 deletions src/llm_kira/client/text_analysis_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@
from .api.summarization.textrank_summarization import TextRankSummarization
from .api.text_similarity.simhash import SimHashSimilarity
from .api.sentiment.sentiment import SentimentAnalysis
from .api.text_similarity.edit import EditSimilarity
from .api.text_similarity.cosion import CosionSimilarity
84 changes: 48 additions & 36 deletions src/llm_kira/utils/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# @Github :sudoskys
import re
import random
from typing import Union, Callable

from .langdetect.langdetect import LangDetector
from ..client.text_analysis_tools.api.keywords.tfidf import TfidfKeywords
Expand All @@ -13,12 +14,20 @@
from ..client.text_analysis_tools.api.summarization.tfidf_summarization import TfidfSummarization
from ..client.text_analysis_tools.api.text_similarity.simhash import SimHashSimilarity
from ..client.text_analysis_tools.api.text_similarity.cosion import CosionSimilarity
from ..client.text_analysis_tools.api.text_similarity.edit import EditSimilarity
from ..client.text_analysis_tools.api.keyphrase.keyphrase import KeyPhraseExtraction
import tiktoken

gpt_tokenizer = tiktoken.get_encoding("gpt2")


def default_gpt_tokenizer(text, raw: bool = False) -> Union[int, list]:
    """Tokenize *text* with the module-level GPT-2 tiktoken encoder.

    :param text: string to encode
    :param raw: when True, return the raw token-id list; otherwise the count
    :return: list of token ids if ``raw`` is True, else the number of tokens
    """
    encoded = gpt_tokenizer.encode(text)
    return encoded if raw else len(encoded)


class Detect(object):
@staticmethod
def isNeedHelp(sentence) -> bool:
Expand Down Expand Up @@ -87,59 +96,52 @@ def get_text_language(sentence: str):
lang_type = detect(text=sentence.replace("\n", "").replace("\r", ""))[0][0].upper()
return lang_type

def gpt_tendency_arg(self, prompt: str, memory: list = None, lang: str = "CN") -> tuple:
def gpt_tendency_arg(self, prompt: str,
memory: list = None,
tokenizer: Callable[[str, bool], Union[int, list]] = default_gpt_tokenizer,
lang: str = "CN") -> tuple:

if memory is None:
memory = []
# 代码
temperature = 0.9
frequency_penalty = 0
presence_penalty = 0

if self.isCode(sentence=prompt):
temperature = 0.9
frequency_penalty = 0
presence_penalty = 0
return frequency_penalty, presence_penalty, temperature
if self.isNeedHelp(sentence=prompt):
temperature = 0.9
frequency_penalty = 0
presence_penalty = 0
return frequency_penalty, presence_penalty, temperature

# 普通情况
temperature = 0.9
if self.isNeedHelp(sentence=prompt):
temperature -= 0.2
frequency_penalty -= 0.1
presence_penalty -= 0.1

# 控制随机数的精度round(数值,精度)
presence_penalty = 0 + round(random.uniform(-1, 1) / 10, 2)
frequency_penalty = 0 + round(random.uniform(-1, 1) / 10, 2)
# presence_penalty += round(random.uniform(-1, 1) / 10, 2)
# frequency_penalty += round(random.uniform(-1, 1) / 10, 2)
_sentiment_score = Utils.sentiment(sentence=prompt).get("score")
while _sentiment_score > 1.5 or _sentiment_score < -1.5:
_sentiment_score = _sentiment_score / 10
_sentiment_score = 0.1 if 0.05 < _sentiment_score < 0.1 else _sentiment_score
_sentiment_score = -0.1 if -0.1 < _sentiment_score < -0.05 else _sentiment_score
# 不谈论新话题
presence_penalty -= _sentiment_score * 0.4
# 拒绝重复
frequency_penalty += _sentiment_score * 0.4

# NEW 高兴正数,就不扭转
presence_penalty -= _sentiment_score * 1.2

# REPEAT 高兴正数,则采用默认加法惩罚
frequency_penalty += _sentiment_score * 0.8
_memory_len = len(memory)
# 对话结束就拒绝复读,扭转为正数!
if _memory_len > 20:
while _memory_len > 20:
_memory_len = _memory_len - 20
if _memory_len / 20 > 0.7:
frequency_penalty = abs(frequency_penalty)
# 验证记忆体
if len(memory) > 3:
# 计算回复指数指标
_token = tokenizer("".join(memory[-4:]), True)
_repeat_score = 2 * (0.8 - len(set(_token)) / len(_token))
frequency_penalty = frequency_penalty + _repeat_score
print(_repeat_score)

# FIX
# Fix
temperature = round(temperature, 1)
presence_penalty = round(presence_penalty, 1)
frequency_penalty = round(frequency_penalty, 1)

# CHECK
temperature = temperature if 0 < temperature < 1 else 0.9

presence_penalty = presence_penalty if -1.8 < presence_penalty else -0.1
presence_penalty = presence_penalty if presence_penalty < 1.8 else 0.1

frequency_penalty = frequency_penalty if -1.8 < frequency_penalty else -0.1
frequency_penalty = frequency_penalty if frequency_penalty < 1.8 else 0.1
# Check
return frequency_penalty, presence_penalty, temperature


Expand Down Expand Up @@ -178,7 +180,7 @@ def tfidf_summarization(sentence: str, ratio=0.5):
return _sum

@staticmethod
def cosion_sismilarity(pre, aft):
def cosion_similarity(pre, aft):
"""
基于余弦计算文本相似性 0 - 1 (1为最相似)
:return: 余弦值
Expand All @@ -187,6 +189,16 @@ def cosion_sismilarity(pre, aft):
_sim = _cos.similarity(pre, aft)
return _sim

@staticmethod
def edit_similarity(pre, aft):
    """
    Text similarity score based on edit distance, delegating to
    ``EditSimilarity.edit_dist``.

    NOTE(review): the original docstring was copy-pasted from the cosine
    helper and wrongly described this as cosine similarity; the code calls
    ``edit_dist``, so this is presumably an edit-distance-based score —
    confirm its range/direction (distance vs. similarity) against
    ``EditSimilarity``.

    :param pre: first text
    :param aft: second text
    :return: score returned by ``EditSimilarity.edit_dist(pre, aft)``
    """
    _cos = EditSimilarity()
    _sim = _cos.edit_dist(pre, aft)
    return _sim

@staticmethod
def simhash_similarity(pre, aft):
"""
Expand Down
Loading

0 comments on commit f09433c

Please sign in to comment.