Merge pull request #16 from the-ai-team/client/12/output_data
[Solved #12] Build summary output
chamodperera committed Mar 3, 2023
1 parent 21080a3 commit 1b9b543
Showing 10 changed files with 193 additions and 69 deletions.
3 changes: 3 additions & 0 deletions client/components.css
@@ -0,0 +1,3 @@
iframe {
    height: auto;
}
94 changes: 94 additions & 0 deletions client/components.py
@@ -0,0 +1,94 @@
import streamlit as st
import streamlit.components.v1 as components
import randomcolor
import random
from css_importer import local_css

rand_color = randomcolor.RandomColor()
colors = []
for i in range(10):
    colors.append(rand_color.generate(luminosity="bright"))

def Generate_Components(data):
    cards = ""
    for item in data:
        cards += generate_card(item)

    components.html(
        f"""
        <link rel="preconnect" href="https://fonts.googleapis.com">
        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@500;600;700&display=swap" rel="stylesheet">
        <style>
            .container{{
                font-family: 'Inter', sans-serif;
                font-weight: 700;
                background: white;
                padding: 10px;
            }}
            .diagrams{{
                padding: 40px;
                position: relative;
                overflow: hidden;
                box-sizing: border-box;
                background: white;
                border-radius: 10px;
            }}
            .diagrams img{{
                width: 100%;
                margin-block: 20px;
            }}
        </style>
        <div class="container">
            {cards}
        </div>
        """,
        height=900,
        scrolling=True
    )

def generate_card(item):
    number = random.random()
    id = str(number)[2:]  # random digit string used to scope each card's CSS class
    color = rand_color.generate(luminosity="light")[0]

    diagrams_html = ""
    if "Diagrams" in item:
        diagram = item["Diagrams"]
        diagrams_html = generate_diagrams(diagram)

    card = f"""
    <style>
        .card-{id}{{
            background-color: {color};
            padding: 40px;
            border-radius: 10px;
            margin: 10px;
        }}
    </style>
    <div class="card-{id}">
        <h2>
            {item["Title"]}
        </h2>
        <p>
            {item["Content"]}
        </p>
        <div>
            {diagrams_html if diagrams_html else ""}
        </div>
    </div>
    """
    return card

def generate_diagrams(diagram):
    type = diagram["Type"]
    if type != "img":
        return ""  # only image diagrams are supported

    images = diagram["Figure"]
    html = """<div class="diagrams">"""
    for image in images:
        html += f"""
        <img src="{image}" />
        """
    html += "</div>"
    return html
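For quick verification, a minimal sketch of how `Generate_Components` can be driven. The file name and `demo_item` dict are hypothetical, not part of this commit; the `Title`/`Content`/`Diagrams` keys follow the schema `generate_card` reads above:

# demo_components.py -- illustrative sketch only; run with `streamlit run demo_components.py`
from components import Generate_Components

demo_item = {
    "Title": "Residual Networks",
    "Content": "A generated summary paragraph would appear here.",
    "Diagrams": {  # optional; only Type == "img" is rendered
        "Type": "img",
        "Figure": ["https://ar5iv.labs.arxiv.org/html/1512.03385/assets/x1.png"],
    },
}

Generate_Components([demo_item])  # renders one styled card inside the component iframe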
10 changes: 10 additions & 0 deletions client/css_importer.py
@@ -0,0 +1,10 @@
import os
import streamlit as st

# Adding styles
script_dir = os.path.dirname(__file__)

def local_css(file_name):
    abs_path_dir = os.path.join(script_dir, file_name)
    with open(abs_path_dir) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
34 changes: 21 additions & 13 deletions client/main.py
@@ -1,20 +1,28 @@
-import streamlit as st
 import os
+import sys
+
+import streamlit as st
+st.set_page_config(page_title="Paper.sum", page_icon=":page_facing_up:")
+st.title("Paper.sum")
 
-st.set_page_config(page_title="SummarizeIt", page_icon=":page_facing_up:")
-st.title("Summarize it")
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from server.main import Generate_summary
+from components import Generate_Components
+from sampledata import sample
 
-# Adding styles
-script_dir = os.path.dirname(__file__)
-def local_css(file_name):
-    abs_path_dir = os.path.join(script_dir, file_name)
-    with open(abs_path_dir) as f:
-        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+from css_importer import local_css
 local_css("style.css")
 
-st.text_input("Input URL to Research Paper")
-selectable_tags = ['Beginner', 'Sample Tag', 'Tag2', 'Tag3']
-default_tags = ['Beginner']
-tags = st.multiselect(
+link = st.text_input("Input URL to Research Paper")
+selectable_tags = ['Experiments', 'Results', 'Beginner']
+default_tags = ['Experiments']
+selected_tags = st.multiselect(
     'Select tags',
     selectable_tags, default_tags)
+
+if link:
+    if st.button('Summarize'):
+        summary = Generate_summary(link, selected_tags)
+        Generate_Components(summary)
+    else:
+        st.markdown('### Click on summarize to generate')
10 changes: 10 additions & 0 deletions client/sampledata.py
@@ -0,0 +1,10 @@
sample = [{'Title': ' Experiments ',
'Content': 'The standard 10-crop testing is adopted for comparison studies in testing. The fully-convolutional form is adopted for best results, and the scores are averaged at multiple scales with images being resized such that the shorter side is in {224,256,384,480,640}. The ImageNet 2012 classification dataset is evaluated, which consists of 1000 classes and is trained on the 1.28 million training images and evaluated on the 50k validation images. The 100k test images are reported by the test server, and the top-1 and top-5 error rates are evaluated.'},
{'Title': ' Plain Networks ', 'Content': '18-layer and 34-layer plain nets are evaluated. The 34-layer plain net is in Fig.\xa03 (middle) and the 18-layer plain net is of a similar form. The results show that the deeper 34-layer plain net has higher validation error than the shallower 18-layer plain net. The training/validation errors are compared in Fig.\xa04 (left) and the degradation problem is observed, where the 34-layer plain net has higher training error throughout the whole training procedure. It is argued that this optimization difficulty is unlikely to be caused by vanishing gradients. The 34-layer plain net is still able to achieve competitive accuracy, suggesting that the solver works to some extent.',
'Diagrams': {'Type': 'img', 'Figure': ['https://ar5iv.labs.arxiv.org/html/1512.03385/assets/x1.png', 'https://ar5iv.labs.arxiv.org/html/1512.03385/assets/x7.png'],
'Description': 'Figure 1: Training error (left) and test error (right) on CIFAR-10 with 20-layer and 56-layer “plain” networks. The deeper network has higher training error, and thus test error. Similar phenomena on ImageNet is presented in Fig.\xa04.'}},
{'Title': ' Residual Networks ', 'Content': '18-layer and 34-layer residual nets (ResNets) are evaluated. The baseline architectures are the same as the above plain nets, except that a shortcut connection is added to each pair of 3×\\times×3 filters as in Fig.\xa03 (right). Identity mapping is used for all shortcuts and zero-padding is used for increasing dimensions (option A). The situation is reversed with residual learning – the 34-layer ResNet is better than the 18-layer ResNet (by 2.8%). The 34-layer ResNet exhibits considerably lower training error and is generalizable to the validation data, indicating that the degradation problem is well addressed in this setting. Compared to its plain counterpart, the 34-layer ResNet reduces the top-1 error by 3.5%. The 18-layer plain/residual nets are comparably accurate, but the 18-layer ResNet converges faster.'},
{'Title': ' Conv Feature Maps ', 'Content': 'Conv feature maps are computed on an image pyramid, where the image’s shorter sides are s∈{200,400,600,800,1000}. Two adjacent scales from the pyramid are selected following , and RoI pooling and subsequent layers are performed on the feature maps of these two scales, which are merged by maxout as in . Multi-scale testing improves the mAP by over 2 points.'},
{'Title': ' Validation Data ', 'Content': 'The 80k+40k trainval set is used for training and the 20k test-dev set is used for evaluation. The result is reported by the evaluation server and the mAP@.5 is 55.7% and the mAP@[.5, .95] is 34.9%. An ensemble of 3 networks is used to boost both tasks, resulting in an mAP of 59.0% and 37.4% on the test-dev set.'},
{'Title': ' PASCAL VOC ', 'Content': 'The model from the COCO dataset (55.7% mAP@.5 in Table\xa09) is fine-tuned on the PASCAL VOC sets. The improvements of box refinement, context, and multi-scale testing are also adopted. 85.6% mAP is achieved on PASCAL VOC 2007 (Table\xa011) and 83.8% on PASCAL VOC 2012 (Table\xa011).'},
{'Title': ' ImageNet Detection ', 'Content': 'The ImageNet Detection (DET) task involves 200 object categories and is evaluated by mAP@.5. The networks are pre-trained on the 1000-class ImageNet classification set, and are fine-tuned on the DET data. The val2 set is used for validation. The single model with ResNet-101 has 58.8% mAP and the ensemble of 3 models has 62.1% mAP on the DET test set (Table\xa012).'},
{'Title': ' ImageNet Localization ', 'Content': 'The ImageNet Localization (LOC) task requires to classify and localize the objects. The image-level classifiers are first adopted for predicting the class labels of an image, and the localization algorithm only accounts for predicting bounding boxes based on the predicted classes.'}]
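Since `sample` appears to mirror the shape of what `Generate_summary` returns (and is already imported in client/main.py), it can serve as an offline fixture for the UI. A sketch, not wired up in this commit:

# illustrative only: render the canned ResNet summary without calling the API
from components import Generate_Components
from sampledata import sample

Generate_Components(sample)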
2 changes: 2 additions & 0 deletions requirements.txt
@@ -5,3 +5,5 @@ python-dotenv==1.0.0
 requests==2.28.2
 streamlit==1.18.1
 tiktoken==0.2.0
+randomcolor~=0.4.4.6
+setuptools~=63.2.0
Empty file added server/__init__.py
103 changes: 50 additions & 53 deletions server/generator.py
@@ -1,66 +1,51 @@
-from embeddings import openai
-from embeddings import Embeddings
+from server.embeddings import openai
+from server.embeddings import Embeddings
 import pandas as pd
+import concurrent.futures
 
 from openai.embeddings_utils import distances_from_embeddings
 
 
-def match_context(
-    keywords, df, max_len=1800
-):
+def get_related_info(keyword, context):
     """
-    match context for a keyword by finding the most similar context from the dataframe
+    Extract related information from the context
     """
-
-    k_embeddings = openai.Embedding.create(input=keywords, engine='text-embedding-ada-002')['data'][0]['embedding'] # Get the embeddings for the keyword
-    df['Distances'] = distances_from_embeddings(k_embeddings, df['Embeddings'].values, distance_metric='cosine') # Get the distances from the embeddings
-
-    returns = []
-    cur_len = 0
-
-    for i, row in df.sort_values('Distances', ascending=True).iterrows(): # Sort by distance and add the text to the context until the context is too long
-
-        cur_len += row['N_tokens'] + 4 # Add the length of the text to the current length
-
-        if cur_len > max_len: # If the context is too long, break
-            break
-
-        returns.append(row["Text"]) # Else add it to the text that is being returned
-
-    return "\n\n###\n\n".join(returns) # Return the context
+    response = openai.ChatCompletion.create( # Create a completion using the keyword and context
+        messages=[{
+            "role": "user",
+            "content": f"""
+            Extract information most related to {keyword} of the following context which was taken from a research paper\n
+            context : {context}\n
+            points :
+            """
+        }],
+        temperature=0.5,
+        max_tokens=1024,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+        model="gpt-3.5-turbo",
+    )
+    return response["choices"][0]["message"]["content"].strip()
 
 def generate_content(
-    df,
+    context,
     keyword,
     model="gpt-3.5-turbo",
-    max_len=1800,
-    debug=False,
     stop_sequence=None
 ):
     """
-    Generate content based on the most similar context from the dataframe texts
+    Generate content based on the generated points of the paper
     """
-    context = match_context(
-        keyword,
-        df,
-        max_len=max_len
-    )
-    if debug:
-        print("Context:\n" + context) # If debug, print the raw model response
-        print("\n\n")
-
-    try:
-        response = openai.ChatCompletion.create( # Create a completions using the keyword and context
+    response = openai.ChatCompletion.create( # Create a completion using the keyword and context
         messages=[{
             "role": "user",
-            "content": f"""
-            generate a structured document under generated subtopics for the following context that extracts the information related on {keyword}.\n Use maximum of 5 subtopics\n.
-            use readable notation\n\n
-            context : {context}\n\n
-            use this format\n
-            ## Generated Subtopic ##\n
-            <Generated paragraph of the subtopic>\n\n
-            structured document in passive voice:
-            """
+            "content": f"""Summarize the following context which is from a research paper under suitable subtopics exclusively related to {keyword}.\n\n
+            use this format,\n
+            ## generated subtopic ##\n
+            <Summarized paragraph under the subtopic>\n\n
+            context: {context}
+            """
         }],
         temperature=0.5,
         max_tokens=2048,
@@ -70,10 +55,8 @@ def generate_content(
         stop=stop_sequence,
         model=model,
     )
-        return response["choices"][0]["message"]["content"].strip()
-    except Exception as e:
-        print(e)
-        return ""
+    return response["choices"][0]["message"]["content"].strip()
 
 
 
 def content_dict(txt):
@@ -89,7 +72,7 @@ def content_dict(txt):
 
     return dict
 
-def match_diagrams(diagrams_df,generated_content_dict,threshold = 0.12):
+def match_diagrams(diagrams_df,generated_content_dict,threshold = 0.15):
     """
     match diagrams for each generated section
     """
@@ -109,7 +92,21 @@ def Generate(content_df,diagrams_df,keyword):
     """
     Main function for generating
    """
-    generated_content = generate_content(content_df,keyword=keyword) # generate content
+    information = [None] * len(content_df) # Initialize the list with None values
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {}
+        for i, row in content_df.iterrows():
+            context = row["Text"]
+            futures[executor.submit(get_related_info, keyword, context)] = i # Use a dictionary to associate each future with an index; args ordered (keyword, context) to match the signature
+        for future in concurrent.futures.as_completed(futures):
+            i = futures[future] # Get the index associated with the completed future
+            information[i] = future.result() # Add the result to the appropriate index in the list
+
+    related_information = "\n".join(information)
+
+    generated_content = generate_content(related_information, keyword)
 
     generated_content_dict = content_dict(generated_content) # create dictionary
 
     generated_content_dict = match_diagrams(diagrams_df,generated_content_dict) # match diagrams
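The rewritten `Generate` fans each text chunk out to a worker thread and fans the results back in by index, so output order matches input order regardless of completion order. A self-contained sketch of the same pattern, with a stub standing in for the OpenAI call:

import concurrent.futures

def fetch_points(keyword, text):
    # stand-in for get_related_info; any slow, independent call fits here
    return f"{keyword}: {text.upper()}"

texts = ["chunk one", "chunk two", "chunk three"]
results = [None] * len(texts)

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(fetch_points, "Experiments", t): i
               for i, t in enumerate(texts)}
    for future in concurrent.futures.as_completed(futures):
        results[futures[future]] = future.result()  # completion order varies; the stored index restores input order

print("\n".join(results))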
4 changes: 2 additions & 2 deletions server/main.py
@@ -1,6 +1,6 @@
-from scraper import *
+from server.scraper import *
 from embeddings import *
-from generator import *
+from server.generator import *
 
 def Generate_summary(url,keyword):
     """
2 changes: 1 addition & 1 deletion server/scraper.py
@@ -29,7 +29,7 @@ def clean_text(raw_text):
     clean_text = re.sub(r'\[\d+(?:,\s*\d+)*\]', '', raw_text).replace('\n',' ') # remove citations and new line characters
     return clean_text
 
-def create_chunks(context,max_tokens=768):
+def create_chunks(context,max_tokens=2048):
     """
     create chunks of context which is suitable to pass to the model
     """
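Only the chunk budget changes here (768 → 2048 tokens), presumably so each chunk hands `get_related_info` a larger slice of the paper per request. The body of `create_chunks` is collapsed in this view; for orientation, a plausible token-budgeted chunker consistent with the signature and the tiktoken dependency in requirements.txt — an assumption-laden sketch, not the committed implementation:

import tiktoken

def create_chunks(context, max_tokens=2048):
    # illustrative sketch only: split on sentences, pack greedily up to the token budget
    encoder = tiktoken.get_encoding("gpt2")  # assumed encoding; the real file may use another
    chunks, current, current_len = [], [], 0
    for sentence in context.split(". "):
        n_tokens = len(encoder.encode(sentence + ". "))
        if current and current_len + n_tokens > max_tokens:
            chunks.append(". ".join(current) + ".")
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_tokens
    if current:
        chunks.append(". ".join(current) + ".")
    return chunks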
