src.subpages.find_duplicates
Find potential duplicates in the data using cosine similarity.
1"""Find potential duplicates in the data using cosine similarity.""" 2import streamlit as st 3from sentence_transformers.util import cos_sim 4 5from src.subpages.page import Context, Page 6 7 8@st.cache() 9def get_sims(texts: list[str], sentence_encoder): 10 embeddings = sentence_encoder.encode(texts, batch_size=8, convert_to_numpy=True) 11 return cos_sim(embeddings, embeddings) 12 13 14class FindDuplicatesPage(Page): 15 name = "Find Duplicates" 16 icon = "fingerprint" 17 18 def get_widget_defaults(self): 19 return { 20 "cutoff": 0.95, 21 } 22 23 def render(self, context: Context): 24 st.title("Find Duplicates") 25 with st.expander("💡", expanded=True): 26 st.write("Find potential duplicates in the data using cosine similarity.") 27 28 cutoff = st.slider("Similarity threshold", min_value=0.0, max_value=1.0, key="cutoff") 29 # split.add_faiss_index(column="embeddings", index_name="sent_index") 30 # st.write("Index is ready") 31 # sentence_encoder.encode(["hello world"], batch_size=8) 32 # st.write(split["tokens"][0]) 33 texts = [" ".join(ts) for ts in context.split["tokens"]] 34 sims = get_sims(texts, context.sentence_encoder) 35 36 candidates = [] 37 for i in range(len(sims)): 38 for j in range(i + 1, len(sims)): 39 if sims[i][j] >= cutoff: 40 candidates.append((sims[i][j], i, j)) 41 candidates.sort(reverse=False) 42 43 for (sim, i, j) in candidates[:100]: 44 st.markdown(f"**Possible duplicate ({i}, {j}, sim: {sim:.3f}):**") 45 st.markdown("* " + " ".join(context.split["tokens"][i])) 46 st.markdown("* " + " ".join(context.split["tokens"][j])) 47 48 # st.write("queries") 49 # results = split.get_nearest_examples("sent_index", np.array(split["embeddings"][0], dtype=np.float32), k=2) 50 # results = split.get_nearest_examples_batch("sent_index", queries, k=2) 51 # st.write(results.total_examples[0]["id"][1]) 52 # st.write(results.total_examples[0])
```python
@st.cache()
def get_sims(texts: list[str], sentence_encoder)
```

Encodes the given texts with the sentence-transformers encoder and returns the matrix of pairwise cosine similarities. `st.cache` memoizes the result, so moving the threshold slider reuses the cached matrix instead of re-encoding the whole split. (Newer Streamlit versions deprecate `st.cache` in favor of `st.cache_data`.)
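To see what this computes in isolation, here is a minimal standalone sketch; the model name is an assumption for illustration, as the app supplies its own encoder via `context.sentence_encoder`:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Hypothetical encoder for illustration; the app passes its own.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

texts = [
    "The cat sat on the mat .",
    "A cat was sitting on the mat .",
    "Stocks fell sharply on Monday .",
]
embeddings = encoder.encode(texts, batch_size=8, convert_to_numpy=True)
sims = cos_sim(embeddings, embeddings)  # shape (3, 3), symmetric, diagonal = 1.0

# The near-paraphrase pair (0, 1) scores far higher than the unrelated pair (0, 2).
print(f"sim(0, 1) = {sims[0][1]:.3f}")
print(f"sim(0, 2) = {sims[0][2]:.3f}")
```

The space-joined token style in these examples mirrors how the page builds its texts, by joining each example's `tokens` column with spaces.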
```python
class FindDuplicatesPage(Page)
```

Subpage that surfaces likely duplicates in the current data split. It inherits from `Page`, the base class for all pages, and registers itself with the name "Find Duplicates" and the "fingerprint" icon.
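The members this module relies on suggest roughly the following interface; this is a sketch inferred from usage, not the actual definition in `src.subpages.page`:

```python
from typing import Any


class Context:
    """Shared app state; only the attributes this page uses are sketched."""
    split: Any             # presumably a datasets.Dataset with a "tokens" column
    sentence_encoder: Any  # presumably a sentence-transformers model


class Page:
    """Base class for all pages (interface inferred from usage in this module)."""
    name: str  # label shown in the navigation
    icon: str  # icon shown next to the label

    def get_widget_defaults(self) -> dict:
        return {}

    def render(self, context: Context) -> None:
        raise NotImplementedError
```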
```python
def get_widget_defaults(self)
```

Holds the default settings for all the page's widgets.

Returns:
    dict: A dictionary of widget defaults, where the keys are the widget names and the values are the default values.
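Only the dict contract is defined here; presumably the `Page` machinery copies these defaults into `st.session_state` before widgets are built, which is why the slider in `render` can rely on `key="cutoff"` alone instead of passing `value=0.95`. A sketch of that seeding pattern, assuming this behavior:

```python
import streamlit as st

defaults = {"cutoff": 0.95}  # what get_widget_defaults() returns

# Seed each default exactly once per session; a widget created later with
# key="cutoff" then starts at 0.95 without an explicit value= argument.
for key, value in defaults.items():
    if key not in st.session_state:
        st.session_state[key] = value
```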
```python
def render(self, context: Context)
```
Renders the page: draws the similarity-threshold slider, joins each example's tokens into a text, computes all pairwise cosine similarities via `get_sims`, and lists up to 100 candidate duplicate pairs that clear the threshold.
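The commented-out lines in the source hint at an alternative the author experimented with: nearest-neighbour search over a FAISS index on precomputed embeddings, which avoids materializing the full n × n similarity matrix. A hedged sketch using the Hugging Face `datasets` API those comments reference; the column and index names follow the comments, and the dataset itself is an assumption:

```python
import numpy as np
from datasets import Dataset


def nearest_neighbour_check(split: Dataset, k: int = 2):
    """Sketch of the FAISS route hinted at by the commented-out code above.

    Assumes `split` already has an "embeddings" column.
    """
    split.add_faiss_index(column="embeddings", index_name="sent_index")
    for idx in range(len(split)):
        query = np.array(split["embeddings"][idx], dtype=np.float32)
        # With k=2, the nearest neighbour is the example itself and the second
        # entry is its closest other sentence, a cheap duplicate candidate.
        scores, examples = split.get_nearest_examples("sent_index", query, k=k)
        yield idx, scores, examples
```

Note that, depending on the index metric, the returned scores are distances or inner products; inner-product search matches cosine similarity only when the embeddings are L2-normalized (e.g. encoded with `normalize_embeddings=True`).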