Merge pull request #16 from the-ai-team/client/12/output_data
[Solved #12] Build summary output
chamodperera committed Mar 3, 2023
1 parent 21080a3 commit 1b9b543
Showing 10 changed files with 193 additions and 69 deletions.
3 changes: 3 additions & 0 deletions client/components.css
@@ -0,0 +1,3 @@
iframe {
    height: auto;
}
94 changes: 94 additions & 0 deletions client/components.py
@@ -0,0 +1,94 @@
import streamlit as st
import streamlit.components.v1 as components
import randomcolor
import random
from css_importer import local_css

rand_color = randomcolor.RandomColor()
colors = []
for i in range(10):
    colors.append(rand_color.generate(luminosity="bright"))

def Generate_Components(data):
    cards = ""
    for item in data:
        cards += generate_card(item)

    components.html(
        f"""
        <link rel="preconnect" href="https://fonts.googleapis.com">
        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@500;600;700&display=swap" rel="stylesheet">
        <style>
            .container{{
                font-family: 'Inter', sans-serif;
                font-weight: 700;
                background: white;
                padding: 10px;
            }}
            .diagrams{{
                padding: 40px;
                position: relative;
                overflow: hidden;
                box-sizing: border-box;
                background: white;
                border-radius: 10px;
            }}
            .diagrams img{{
                width: 100%;
                margin-block: 20px;
            }}
        </style>
        <div class="container">
            {cards}
        </div>
        """,
        height=900,
        scrolling=True
    )

def generate_card(item):
    number = random.random()
    id = str(number)[2:]  # random digit string used to scope each card's CSS class
    color = rand_color.generate(luminosity="light")[0]

    diagrams_html = ""
    if "Diagrams" in item:
        diagram = item["Diagrams"]
        diagrams_html = generate_diagrams(diagram)

    card = f"""
    <style>
        .card-{id}{{
            background-color: {color};
            padding: 40px;
            border-radius: 10px;
            margin: 10px;
        }}
    </style>
    <div class="card-{id}">
        <h2>
            {item["Title"]}
        </h2>
        <p>
            {item["Content"]}
        </p>
        <div>
            {diagrams_html if diagrams_html else ""}
        </div>
    </div>
    """
    return card

def generate_diagrams(diagram):
    type = diagram["Type"]
    if type != "img":
        return ""  # only image diagrams are supported

    images = diagram["Figure"]
    html = """<div class="diagrams">"""
    for image in images:
        html += f"""
        <img src="{image}" />
        """
    html += "</div>"
    return html
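For quick verification, a minimal sketch of how `Generate_Components` can be driven. The file name and `demo_item` dict are hypothetical, not part of this commit; the `Title`/`Content`/`Diagrams` keys follow the schema `generate_card` reads above:

# demo_components.py -- illustrative sketch only; run with `streamlit run demo_components.py`
from components import Generate_Components

demo_item = {
    "Title": "Residual Networks",
    "Content": "A generated summary paragraph would appear here.",
    "Diagrams": {  # optional; only Type == "img" is rendered
        "Type": "img",
        "Figure": ["https://ar5iv.labs.arxiv.org/html/1512.03385/assets/x1.png"],
    },
}

Generate_Components([demo_item])  # renders one styled card inside the component iframe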
10 changes: 10 additions & 0 deletions client/css_importer.py
@@ -0,0 +1,10 @@
import os
import streamlit as st

# Adding styles
script_dir = os.path.dirname(__file__)

def local_css(file_name):
    abs_path_dir = os.path.join(script_dir, file_name)
    with open(abs_path_dir) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
34 changes: 21 additions & 13 deletions client/main.py
@@ -1,20 +1,28 @@
-import streamlit as st
 import os
+import sys
+
+import streamlit as st
+st.set_page_config(page_title="Paper.sum", page_icon=":page_facing_up:")
+st.title("Paper.sum")
 
-st.set_page_config(page_title="SummarizeIt", page_icon=":page_facing_up:")
-st.title("Summarize it")
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from server.main import Generate_summary
+from components import Generate_Components
+from sampledata import sample
 
-# Adding styles
-script_dir = os.path.dirname(__file__)
-def local_css(file_name):
-    abs_path_dir = os.path.join(script_dir, file_name)
-    with open(abs_path_dir) as f:
-        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+from css_importer import local_css
 local_css("style.css")
 
-st.text_input("Input URL to Research Paper")
-selectable_tags = ['Beginner', 'Sample Tag', 'Tag2', 'Tag3']
-default_tags = ['Beginner']
-tags = st.multiselect(
+link = st.text_input("Input URL to Research Paper")
+selectable_tags = ['Experiments', 'Results', 'Beginner']
+default_tags = ['Experiments']
+selected_tags = st.multiselect(
     'Select tags',
     selectable_tags, default_tags)
+
+if link:
+    if st.button('Summarize'):
+        summary = Generate_summary(link, selected_tags)
+        Generate_Components(summary)
+    else:
+        st.markdown('### Click on summarize to generate')
10 changes: 10 additions & 0 deletions client/sampledata.py
@@ -0,0 +1,10 @@
sample = [{'Title': ' Experiments ',
'Content': 'The standard 10-crop testing is adopted for comparison studies in testing. The fully-convolutional form is adopted for best results, and the scores are averaged at multiple scales with images being resized such that the shorter side is in {224,256,384,480,640}. The ImageNet 2012 classification dataset is evaluated, which consists of 1000 classes and is trained on the 1.28 million training images and evaluated on the 50k validation images. The 100k test images are reported by the test server, and the top-1 and top-5 error rates are evaluated.'},
{'Title': ' Plain Networks ', 'Content': '18-layer and 34-layer plain nets are evaluated. The 34-layer plain net is in Fig.\xa03 (middle) and the 18-layer plain net is of a similar form. The results show that the deeper 34-layer plain net has higher validation error than the shallower 18-layer plain net. The training/validation errors are compared in Fig.\xa04 (left) and the degradation problem is observed, where the 34-layer plain net has higher training error throughout the whole training procedure. It is argued that this optimization difficulty is unlikely to be caused by vanishing gradients. The 34-layer plain net is still able to achieve competitive accuracy, suggesting that the solver works to some extent.',
'Diagrams': {'Type': 'img', 'Figure': ['https://ar5iv.labs.arxiv.org/html/1512.03385/assets/x1.png', 'https://ar5iv.labs.arxiv.org/html/1512.03385/assets/x7.png'],
'Description': 'Figure 1: Training error (left) and test error (right) on CIFAR-10 with 20-layer and 56-layer “plain” networks. The deeper network has higher training error, and thus test error. Similar phenomena on ImageNet is presented in Fig.\xa04.'}},
{'Title': ' Residual Networks ', 'Content': '18-layer and 34-layer residual nets (ResNets) are evaluated. The baseline architectures are the same as the above plain nets, except that a shortcut connection is added to each pair of 3×\\times×3 filters as in Fig.\xa03 (right). Identity mapping is used for all shortcuts and zero-padding is used for increasing dimensions (option A). The situation is reversed with residual learning – the 34-layer ResNet is better than the 18-layer ResNet (by 2.8%). The 34-layer ResNet exhibits considerably lower training error and is generalizable to the validation data, indicating that the degradation problem is well addressed in this setting. Compared to its plain counterpart, the 34-layer ResNet reduces the top-1 error by 3.5%. The 18-layer plain/residual nets are comparably accurate, but the 18-layer ResNet converges faster.'},
{'Title': ' Conv Feature Maps ', 'Content': 'Conv feature maps are computed on an image pyramid, where the image’s shorter sides are s∈{200,400,600,800,1000}. Two adjacent scales from the pyramid are selected following , and RoI pooling and subsequent layers are performed on the feature maps of these two scales, which are merged by maxout as in . Multi-scale testing improves the mAP by over 2 points.'},
{'Title': ' Validation Data ', 'Content': 'The 80k+40k trainval set is used for training and the 20k test-dev set is used for evaluation. The result is reported by the evaluation server and the mAP@.5 is 55.7% and the mAP@[.5, .95] is 34.9%. An ensemble of 3 networks is used to boost both tasks, resulting in an mAP of 59.0% and 37.4% on the test-dev set.'},
{'Title': ' PASCAL VOC ', 'Content': 'The model from the COCO dataset (55.7% mAP@.5 in Table\xa09) is fine-tuned on the PASCAL VOC sets. The improvements of box refinement, context, and multi-scale testing are also adopted. 85.6% mAP is achieved on PASCAL VOC 2007 (Table\xa011) and 83.8% on PASCAL VOC 2012 (Table\xa011).'},
{'Title': ' ImageNet Detection ', 'Content': 'The ImageNet Detection (DET) task involves 200 object categories and is evaluated by mAP@.5. The networks are pre-trained on the 1000-class ImageNet classification set, and are fine-tuned on the DET data. The val2 set is used for validation. The single model with ResNet-101 has 58.8% mAP and the ensemble of 3 models has 62.1% mAP on the DET test set (Table\xa012).'},
{'Title': ' ImageNet Localization ', 'Content': 'The ImageNet Localization (LOC) task requires to classify and localize the objects. The image-level classifiers are first adopted for predicting the class labels of an image, and the localization algorithm only accounts for predicting bounding boxes based on the predicted classes.'}]
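Since `sample` appears to mirror the shape of what `Generate_summary` returns (and is already imported in client/main.py), it can serve as an offline fixture for the UI. A sketch, not wired up in this commit:

# illustrative only: render the canned ResNet summary without calling the API
from components import Generate_Components
from sampledata import sample

Generate_Components(sample)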
2 changes: 2 additions & 0 deletions requirements.txt
@@ -5,3 +5,5 @@ python-dotenv==1.0.0
 requests==2.28.2
 streamlit==1.18.1
 tiktoken==0.2.0
+randomcolor~=0.4.4.6
+setuptools~=63.2.0
Empty file added server/__init__.py
103 changes: 50 additions & 53 deletions server/generator.py
@@ -1,66 +1,51 @@
-from embeddings import openai
-from embeddings import Embeddings
+from server.embeddings import openai
+from server.embeddings import Embeddings
 import pandas as pd
+import concurrent.futures
 
 from openai.embeddings_utils import distances_from_embeddings
 
 
-def match_context(
-    keywords, df, max_len=1800
-):
+def get_related_info(keyword, context):
     """
-    match context for a keyword by finding the most similar context from the dataframe
+    Extract related information from the context
     """
-
-    k_embeddings = openai.Embedding.create(input=keywords, engine='text-embedding-ada-002')['data'][0]['embedding'] # Get the embeddings for the keyword
-    df['Distances'] = distances_from_embeddings(k_embeddings, df['Embeddings'].values, distance_metric='cosine') # Get the distances from the embeddings
-
-    returns = []
-    cur_len = 0
-
-    for i, row in df.sort_values('Distances', ascending=True).iterrows(): # Sort by distance and add the text to the context until the context is too long
-
-        cur_len += row['N_tokens'] + 4 # Add the length of the text to the current length
-
-        if cur_len > max_len: # If the context is too long, break
-            break
-
-        returns.append(row["Text"]) # Else add it to the text that is being returned
-
-    return "\n\n###\n\n".join(returns) # Return the context
+    response = openai.ChatCompletion.create( # Create a completion using the keyword and context
+        messages=[{
+            "role": "user",
+            "content": f"""
+            Extract information most related to {keyword} of the following context which was taken from a research paper\n
+            context : {context}\n
+            points :
+            """
+        }],
+        temperature=0.5,
+        max_tokens=1024,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+        model="gpt-3.5-turbo",
+    )
+    return response["choices"][0]["message"]["content"].strip()
 
 def generate_content(
-    df,
+    context,
     keyword,
     model="gpt-3.5-turbo",
-    max_len=1800,
-    debug=False,
     stop_sequence=None
 ):
     """
-    Generate content based on the most similar context from the dataframe texts
+    Generate content based on the generated points of the paper
     """
-    context = match_context(
-        keyword,
-        df,
-        max_len=max_len
-    )
-    if debug:
-        print("Context:\n" + context) # If debug, print the raw model response
-        print("\n\n")
-
-    try:
-        response = openai.ChatCompletion.create( # Create a completions using the keyword and context
+    response = openai.ChatCompletion.create( # Create a completion using the keyword and context
         messages=[{
             "role": "user",
-            "content": f"""
-            generate a structured document under generated subtopics for the following context that extracts the information related on {keyword}.\n Use maximum of 5 subtopics\n.
-            use readable notation\n\n
-            context : {context}\n\n
-            use this format\n
-            ## Generated Subtopic ##\n
-            <Generated paragraph of the subtopic>\n\n
-            structured document in passive voice:
-            """
+            "content": f"""Summarize the following context which is from a research paper under suitable subtopics exclusively related to {keyword}.\n\n
+            use this format,\n
+            ## generated subtopic ##\n
+            <Summarized paragraph under the subtopic>\n\n
+            context: {context}
+            """
         }],
         temperature=0.5,
         max_tokens=2048,
@@ -70,10 +55,8 @@ def generate_content(
         stop=stop_sequence,
         model=model,
     )
-        return response["choices"][0]["message"]["content"].strip()
-    except Exception as e:
-        print(e)
-        return ""
+    return response["choices"][0]["message"]["content"].strip()
 
 
 
 def content_dict(txt):
@@ -89,7 +72,7 @@ def content_dict(txt):
 
     return dict
 
-def match_diagrams(diagrams_df,generated_content_dict,threshold = 0.12):
+def match_diagrams(diagrams_df,generated_content_dict,threshold = 0.15):
     """
     match diagrams for each generated section
     """
@@ -109,7 +92,21 @@ def Generate(content_df,diagrams_df,keyword):
     """
     Main function for generating
    """
-    generated_content = generate_content(content_df,keyword=keyword) # generate content
+    information = [None] * len(content_df) # Initialize the list with None values
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {}
+        for i, row in content_df.iterrows():
+            context = row["Text"]
+            futures[executor.submit(get_related_info, keyword, context)] = i # Use a dictionary to associate each future with an index; args ordered (keyword, context) to match the signature
+        for future in concurrent.futures.as_completed(futures):
+            i = futures[future] # Get the index associated with the completed future
+            information[i] = future.result() # Add the result to the appropriate index in the list
+
+    related_information = "\n".join(information)
+
+    generated_content = generate_content(related_information, keyword)
 
     generated_content_dict = content_dict(generated_content) # create dictionary
 
     generated_content_dict = match_diagrams(diagrams_df,generated_content_dict) # match diagrams
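The rewritten `Generate` fans each text chunk out to a worker thread and fans the results back in by index, so output order matches input order regardless of completion order. A self-contained sketch of the same pattern, with a stub standing in for the OpenAI call:

import concurrent.futures

def fetch_points(keyword, text):
    # stand-in for get_related_info; any slow, independent call fits here
    return f"{keyword}: {text.upper()}"

texts = ["chunk one", "chunk two", "chunk three"]
results = [None] * len(texts)

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(fetch_points, "Experiments", t): i
               for i, t in enumerate(texts)}
    for future in concurrent.futures.as_completed(futures):
        results[futures[future]] = future.result()  # completion order varies; the stored index restores input order

print("\n".join(results))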
4 changes: 2 additions & 2 deletions server/main.py
@@ -1,6 +1,6 @@
-from scraper import *
+from server.scraper import *
 from embeddings import *
-from generator import *
+from server.generator import *
 
 def Generate_summary(url,keyword):
     """
2 changes: 1 addition & 1 deletion server/scraper.py
@@ -29,7 +29,7 @@ def clean_text(raw_text):
     clean_text = re.sub(r'\[\d+(?:,\s*\d+)*\]', '', raw_text).replace('\n',' ') # remove citations and new line characters
     return clean_text
 
-def create_chunks(context,max_tokens=768):
+def create_chunks(context,max_tokens=2048):
     """
     create chunks of context which is suitable to pass to the model
     """
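Only the chunk budget changes here (768 → 2048 tokens), presumably so each chunk hands `get_related_info` a larger slice of the paper per request. The body of `create_chunks` is collapsed in this view; for orientation, a plausible token-budgeted chunker consistent with the signature and the tiktoken dependency in requirements.txt — an assumption-laden sketch, not the committed implementation:

import tiktoken

def create_chunks(context, max_tokens=2048):
    # illustrative sketch only: split on sentences, pack greedily up to the token budget
    encoder = tiktoken.get_encoding("gpt2")  # assumed encoding; the real file may use another
    chunks, current, current_len = [], [], 0
    for sentence in context.split(". "):
        n_tokens = len(encoder.encode(sentence + ". "))
        if current and current_len + n_tokens > max_tokens:
            chunks.append(". ".join(current) + ".")
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_tokens
    if current:
        chunks.append(". ".join(current) + ".")
    return chunks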
