-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
162 lines (144 loc) · 6.66 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# |‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾|
# | import |
# |_____________________________|
# import the required libraries
import streamlit as st
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DistilBertForSequenceClassification
import torch
import emoji
import time
from PIL import Image
from pathlib import Path
import pandas as pd
import numpy as np
import requests
# |‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾|
# |           models            |
# |_____________________________|
# Bootstrap pipeline: DistilBERT fine-tuned on SST-2 (English sentiment).
# The selectbox handler below rebuilds tokenizer/model/pl for whichever
# checkpoint the user picks, so this only provides a sane initial state.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name)  # not used
pl = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    framework='pt',
)
# |‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾|
# |            title            |
# |_____________________________|
# Page header, caption, and banner image. Local media assets live in /media.
st.markdown("# :red[Toxic] or Not.")
st.caption("An implementation of a tweet language analyzer.")
banner = Image.open('media/L_two.png')
st.image(banner)
# |‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾|
# |      choose the model       |
# |_____________________________|
# `option` holds the chosen model name; the first entry is the default.
MODEL_CHOICES = (
    'distilbert-base-uncased-finetuned-sst-2-english',
    'fine-trained-distilbert',
)
option = st.selectbox('What model would you like to use?', MODEL_CHOICES)
# Branch 1: run the SST-2 sentiment model locally via a transformers pipeline.
if option == 'distilbert-base-uncased-finetuned-sst-2-english':
    # Rebuild the pipeline for the selected checkpoint.  Pass the loaded
    # `model` object to pipeline() — the original passed the name string,
    # which makes pipeline() resolve/load the checkpoint a second time.
    tokenizer = AutoTokenizer.from_pretrained(option, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(option)
    pl = pipeline("sentiment-analysis", model=model,
                  tokenizer=tokenizer, framework='pt')
    # Text box for the phrase to analyze (pre-filled with a demo phrase).
    # Renamed from `input` so the builtin is not shadowed.
    user_text = st.text_area(
        'Enter a phrase and press ctrl-enter to analyze it:',
        'grrrr jappan 🇯🇵 is best country in teh world (sekai) !!!!🤬😡!!!👹🤬!!!!! west bAd grrrgghhhg japenis culture⛩🎎🎏 better than amrican🗽🍔👎!!! (>~<) vendor machine eveywhere 🗼and sakura trees are so 🌸 a e s t h e t i c 🌸 UwU if u hate it then your NOT a man of culture so shinē!!! ~hmph baka -_- 🏮')
    result = pl(user_text)
    # Display the predicted label and its confidence score.
    if result[0]["label"] == "NEGATIVE":
        st.markdown(emoji.emojize("Text entry is negative :thumbsdown:"))
        st.write(result[0]["score"])
        st.warning("negative score :<", icon="⚠️")
    elif result[0]["label"] == "POSITIVE":
        st.markdown(emoji.emojize("Text entry is positive :thumbsup:"))
        st.write(result[0]["score"])
        st.success("positive score!")
        st.balloons()
    else:
        # Unexpected label from the pipeline.
        st.markdown(emoji.emojize("something went wrong :x:"))
# apparently, switch/case doesn't work in the python parser on huggingface
# therefore, we use elif
elif option == 'fine-trained-distilbert':
# initialze values for API call
# query the api because i couldn't find a way to get the secrets to work
# please don't steal my api key q-q
def query(payload):
headers = {
"Authorization": f"Bearer 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"
}
API_URL = "https://api-inference.huggingface.co/models/davidchiii/fine-trained-distilbert"
response = requests.post(API_URL, headers=headers, json=payload)
return response.json()
# 🤨
# input area
input = st.text_area('Enter a phrase and press ctrl-enter to analyze it:',
'grrrr jappan 🇯🇵 is best country in teh world (sekai) !!!!🤬😡!!!👹🤬!!!!! west bAd grrrgghhhg japenis culture⛩🎎🎏 better than amrican🗽🍔👎!!! (>~<) vendor machine eveywhere 🗼and sakura trees are so 🌸 a e s t h e t i c 🌸 UwU if u hate it then your NOT a man of culture so shinē!!! ~hmph baka -_- 🏮')
# call query to request for the API to run the model
output = query({
"inputs": input,
})
# st.write(type(output[0]))
# output the labels
if not output:
pass
# write values to a dictionary to sort them
dict = {}
label_cols = ['toxic', 'severe_toxic', 'obscene',
'threat', 'insult', 'identity_hate']
# st.write(output[0][0]['label'])
# st.write(output[0][1]['label'])
# st.write(output[0][2]['label'])
# st.write(output[0][3]['label'])
# st.write(output[0][4]['label'])
# st.write(output[0][5]['label'])
# st.write(output)
# set values for local dictionary
for i, key in enumerate(output[0]):
if key['label'] == "LABEL_0":
dict["toxic"] = key["score"]
elif key['label'] == "LABEL_1":
dict["severe_toxic"] = key["score"]
elif key['label'] == "LABEL_2":
dict["obscene"] = key["score"]
elif key['label'] == "LABEL_3":
dict["threat"] = key["score"]
elif key['label'] == "LABEL_4":
dict["insult"] = key["score"]
elif key['label'] == "LABEL_5":
dict["identity_hate"] = key["score"]
# read through all dictionary values to determine the largest
largest = 'toxic'
for key in dict.keys():
if dict[largest] < dict[key]:
largest = key
# create and write the front end
col1, col2, col3 = st.columns(3)
with col1:
st.subheader("Text:")
st.write(input)
with col2:
st.subheader("Most Prevalent Label:")
# should be toxic for most of the time, but it's vague
st.write(largest)
with col3:
st.subheader("Value:")
st.write(dict)