src.subpages.find_duplicates
Find potential duplicates in the data using cosine similarity.
1"""Find potential duplicates in the data using cosine similarity.""" 2import streamlit as st 3from sentence_transformers.util import cos_sim 4 5from src.subpages.page import Context, Page 6 7 8@st.cache() 9def get_sims(texts: list[str], sentence_encoder): 10 embeddings = sentence_encoder.encode(texts, batch_size=8, convert_to_numpy=True) 11 return cos_sim(embeddings, embeddings) 12 13 14class FindDuplicatesPage(Page): 15 name = "Find Duplicates" 16 icon = "fingerprint" 17 18 def get_widget_defaults(self): 19 return { 20 "cutoff": 0.95, 21 } 22 23 def render(self, context: Context): 24 st.title("Find Duplicates") 25 with st.expander("💡", expanded=True): 26 st.write("Find potential duplicates in the data using cosine similarity.") 27 28 cutoff = st.slider("Similarity threshold", min_value=0.0, max_value=1.0, key="cutoff") 29 # split.add_faiss_index(column="embeddings", index_name="sent_index") 30 # st.write("Index is ready") 31 # sentence_encoder.encode(["hello world"], batch_size=8) 32 # st.write(split["tokens"][0]) 33 texts = [" ".join(ts) for ts in context.split["tokens"]] 34 sims = get_sims(texts, context.sentence_encoder) 35 36 candidates = [] 37 for i in range(len(sims)): 38 for j in range(i + 1, len(sims)): 39 if sims[i][j] >= cutoff: 40 candidates.append((sims[i][j], i, j)) 41 candidates.sort(reverse=False) 42 43 for (sim, i, j) in candidates[:100]: 44 st.markdown(f"**Possible duplicate ({i}, {j}, sim: {sim:.3f}):**") 45 st.markdown("* " + " ".join(context.split["tokens"][i])) 46 st.markdown("* " + " ".join(context.split["tokens"][j])) 47 48 # st.write("queries") 49 # results = split.get_nearest_examples("sent_index", np.array(split["embeddings"][0], dtype=np.float32), k=2) 50 # results = split.get_nearest_examples_batch("sent_index", queries, k=2) 51 # st.write(results.total_examples[0]["id"][1]) 52 # st.write(results.total_examples[0])
```python
@st.cache()
def get_sims(texts: list[str], sentence_encoder)
```

Encodes the given texts with the sentence-transformers encoder and returns the matrix of pairwise cosine similarities. `st.cache` memoizes the result, so moving the threshold slider reuses the cached matrix instead of re-encoding the whole split. (Newer Streamlit versions deprecate `st.cache` in favor of `st.cache_data`.)
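To see what this computes in isolation, here is a minimal standalone sketch; the model name is an assumption for illustration, as the app supplies its own encoder via `context.sentence_encoder`:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Hypothetical encoder for illustration; the app passes its own.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

texts = [
    "The cat sat on the mat .",
    "A cat was sitting on the mat .",
    "Stocks fell sharply on Monday .",
]
embeddings = encoder.encode(texts, batch_size=8, convert_to_numpy=True)
sims = cos_sim(embeddings, embeddings)  # shape (3, 3), symmetric, diagonal = 1.0

# The near-paraphrase pair (0, 1) scores far higher than the unrelated pair (0, 2).
print(f"sim(0, 1) = {sims[0][1]:.3f}")
print(f"sim(0, 2) = {sims[0][2]:.3f}")
```

The space-joined token style in these examples mirrors how the page builds its texts, by joining each example's `tokens` column with spaces.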
```python
class FindDuplicatesPage(Page)
```

Subpage that surfaces likely duplicates in the current data split. It inherits from `Page`, the base class for all pages, and registers itself with the name "Find Duplicates" and the "fingerprint" icon.
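The members this module relies on suggest roughly the following interface; this is a sketch inferred from usage, not the actual definition in `src.subpages.page`:

```python
from typing import Any


class Context:
    """Shared app state; only the attributes this page uses are sketched."""
    split: Any             # presumably a datasets.Dataset with a "tokens" column
    sentence_encoder: Any  # presumably a sentence-transformers model


class Page:
    """Base class for all pages (interface inferred from usage in this module)."""
    name: str  # label shown in the navigation
    icon: str  # icon shown next to the label

    def get_widget_defaults(self) -> dict:
        return {}

    def render(self, context: Context) -> None:
        raise NotImplementedError
```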
```python
def get_widget_defaults(self)
```

Holds the default settings for all the page's widgets.

Returns:
    dict: A dictionary of widget defaults, where the keys are the widget names and the values are the default values.
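Only the dict contract is defined here; presumably the `Page` machinery copies these defaults into `st.session_state` before widgets are built, which is why the slider in `render` can rely on `key="cutoff"` alone instead of passing `value=0.95`. A sketch of that seeding pattern, assuming this behavior:

```python
import streamlit as st

defaults = {"cutoff": 0.95}  # what get_widget_defaults() returns

# Seed each default exactly once per session; a widget created later with
# key="cutoff" then starts at 0.95 without an explicit value= argument.
for key, value in defaults.items():
    if key not in st.session_state:
        st.session_state[key] = value
```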
```python
def render(self, context: Context)
```
Renders the page: draws the similarity-threshold slider, joins each example's tokens into a text, computes all pairwise cosine similarities via `get_sims`, and lists up to 100 candidate duplicate pairs that clear the threshold.
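The commented-out lines in the source hint at an alternative the author experimented with: nearest-neighbour search over a FAISS index on precomputed embeddings, which avoids materializing the full n × n similarity matrix. A hedged sketch using the Hugging Face `datasets` API those comments reference; the column and index names follow the comments, and the dataset itself is an assumption:

```python
import numpy as np
from datasets import Dataset


def nearest_neighbour_check(split: Dataset, k: int = 2):
    """Sketch of the FAISS route hinted at by the commented-out code above.

    Assumes `split` already has an "embeddings" column.
    """
    split.add_faiss_index(column="embeddings", index_name="sent_index")
    for idx in range(len(split)):
        query = np.array(split["embeddings"][idx], dtype=np.float32)
        # With k=2, the nearest neighbour is the example itself and the second
        # entry is its closest other sentence, a cheap duplicate candidate.
        scores, examples = split.get_nearest_examples("sent_index", query, k=k)
        yield idx, scores, examples
```

Note that, depending on the index metric, the returned scores are distances or inner products; inner-product search matches cosine similarity only when the embeddings are L2-normalized (e.g. encoded with `normalize_embeddings=True`).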