# Create and activate an isolated virtual environment inside the project
# folder, then install the Jupyter notebook server and the kernel package.
cd /opt/notebook
python3 -m venv .venv
source .venv/bin/activate
pip install notebook ipykernel
# Open the current folder in the editor (presumably the VS Code CLI).
code .

import os

ruta_archivo = './sentences.json'

# Read the JSON file as text so its characters can be counted.
with open(ruta_archivo, 'r', encoding='utf-8') as f:
    contenido = f.read()

# Characters and bytes only coincide for pure-ASCII content: any non-ASCII
# character takes several bytes in UTF-8. The byte size is therefore taken
# from the file on disk instead of being assumed equal to the character count.
num_caracteres = len(contenido)
num_bytes = os.path.getsize(ruta_archivo)

print(f"El archivo tiene {num_caracteres} caracteres, lo que equivale a {num_bytes} bytes ({num_bytes * 8} bits).")

El archivo tiene 7211 caracteres, lo que equivale a 7211 bytes (57688 bits).

import json
import re

def rle_compress_word(word):
    """Run-length encode consecutive repeated letters of *word*.

    A run of two or more identical alphabetic characters is replaced by
    ``<count><char>``; single characters and any non-alphabetic characters
    (digits, punctuation, ...) are copied through unchanged, one by one.
    A falsy *word* is returned as is.
    """
    if not word:
        return word

    pieces = []
    total = len(word)
    start = 0
    while start < total:
        symbol = word[start]
        end = start + 1
        # Only letters are allowed to form runs longer than one.
        if symbol.isalpha():
            while end < total and word[end] == symbol:
                end += 1
        run_length = end - start
        # The count prefix is written only when there was an actual repetition.
        if run_length > 1:
            pieces.append(f"{run_length}{symbol}")
        else:
            pieces.append(symbol)
        start = end

    return ''.join(pieces)

def compress_sentence(sentence):
    """Apply RLE to every non-blank token of *sentence*.

    The sentence is split into word runs, whitespace runs and single
    punctuation characters; whitespace tokens are copied verbatim and all
    other tokens go through ``rle_compress_word``. The pieces are joined
    back in their original order.
    """
    pieces = []
    for chunk in re.findall(r"\w+|\s+|[^\w\s]", sentence):
        if chunk.strip():
            pieces.append(rle_compress_word(chunk))
        else:
            # Pure whitespace: keep exactly as found.
            pieces.append(chunk)
    return ''.join(pieces)

def process_json_file(input_path, output_path):
    """Compress every sentence of a JSON file and save the result.

    Reads *input_path* as UTF-8 JSON, runs ``compress_sentence`` over each
    item of the loaded collection, writes the resulting list to
    *output_path* as single-line JSON (non-ASCII characters kept as is) and
    prints a confirmation message.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        sentences = json.load(source)

    # Compress each sentence independently, keeping the original order.
    compressed = list(map(compress_sentence, sentences))

    # No indentation is requested, so the output has no line breaks.
    serialized = json.dumps(compressed, ensure_ascii=False)
    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(serialized)

    print(f"Compresión completada. Resultado guardado en {output_path}")

# Run the RLE compression over the dataset and write the result next to it.
input_json = 'sentences.json'      
output_json = 'sentences_rle.json'
process_json_file(input_json, output_json)

Compresión completada. Resultado guardado en sentences_rle.json

import os

ruta_archivo = './sentences_rle.json'
with open(ruta_archivo, 'r', encoding='utf-8') as f:
    contenido = f.read()
# The character count equals the byte count only for pure-ASCII content, so
# the byte size is read from the file on disk rather than derived from len().
num_caracteres = len(contenido)
num_bytes = os.path.getsize(ruta_archivo)
print(f"El archivo tiene {num_caracteres} caracteres, lo que equivale a {num_bytes} bytes ({num_bytes * 8} bits).")

El archivo tiene 7211 caracteres, lo que equivale a 7211 bytes (57688 bits).

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import json

def svd_text_compress(texts, k=10, top_words=5):
    """Summarise each text by the top terms of a rank-k TF-IDF approximation.

    Args:
        texts: iterable of strings; each one becomes a row of the TF-IDF
            matrix.
        k: number of singular values/vectors kept for the approximation.
        top_words: maximum number of terms reported per text (default 5,
            the value that used to be hard-coded). Values <= 0 yield empty
            strings.

    Returns:
        A list with one string per input text: up to *top_words* vocabulary
        terms, ordered by decreasing reconstructed weight and separated by
        spaces. Terms whose reconstructed weight is not positive are
        omitted, so a string may contain fewer than *top_words* terms.
    """
    # 1. Build the sentence-by-term TF-IDF matrix.
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(texts).toarray()

    # Informational only: reported bound on the number of components.
    # NOTE(review): the "- 1" is the usual bound for mean-centred data; the
    # matrix is not centred here - confirm the intended bound.
    total_components = min(tfidf.shape[0] - 1, tfidf.shape[1])
    print(f"Total de componentes posibles: {total_components}")

    # 2. Thin SVD of the TF-IDF matrix.
    U, s, Vt = np.linalg.svd(tfidf, full_matrices=False)

    # 3. Keep only the first k components.
    Uk = U[:, :k]
    sk = np.diag(s[:k])
    Vtk = Vt[:k, :]

    # 4. Low-rank reconstruction.
    tfidf_approx = Uk @ sk @ Vtk

    # 5. Map the strongest reconstructed weights back to vocabulary terms.
    vocabulary = vectorizer.get_feature_names_out()
    # Clamp so that top_words == 0 selects nothing instead of everything.
    limit = max(top_words, 0)
    reconstructed_texts = []
    for row in tfidf_approx:
        top_words_idx = np.argsort(row)[::-1][:limit]
        # Skip terms with non-positive weight: they carry no evidence for
        # this text and would only be picked to pad the list.
        reconstructed_texts.append(
            " ".join(vocabulary[i] for i in top_words_idx if row[i] > 0)
        )

    return reconstructed_texts

def get_total_components(texts):
    """Return the component bound reported for the TF-IDF matrix of *texts*.

    The value is computed exactly like the figure printed by
    ``svd_text_compress``: ``min(n_texts - 1, n_terms)``. The previous
    version built the matrix but returned ``None``.
    """
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(texts)
    return min(tfidf.shape[0] - 1, tfidf.shape[1])

def process_json(input_path, output_path, k=10):
    """Run the SVD text compression on a JSON file and save the result.

    Args:
        input_path: path of a JSON file holding the list of texts.
        output_path: path of the JSON file to write.
        k: number of SVD components passed to ``svd_text_compress``.

    The output is an indented JSON object with a ``metadata`` section
    (method name, k, number of input texts, number of distinct
    whitespace-separated tokens) and the compressed texts under ``data``.
    """
    # Explicit UTF-8 keeps reading and writing independent of the platform
    # default encoding, matching process_json_file.
    with open(input_path, encoding='utf-8') as f:
        data = json.load(f)

    compressed_data = svd_text_compress(data, k)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            "metadata": {
                "method": "Manual-SVD",
                "components": k,
                "original_samples": len(data),
                # Distinct raw tokens (case and punctuation sensitive).
                "vocab_size": len(set(" ".join(data).split()))
            },
            "data": compressed_data
        }, f, indent=2)

# Compress the dataset keeping 5 SVD components and store the result.
k = 5
process_json('sentences.json', 'sentences_svd.json', k=k)

Total de componentes posibles: 99

import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tabulate import tabulate

# Open the file for reading; the with-block closes it automatically on exit.
with open('sentences.json') as f:
    sentences = json.load(f)

# Per-sentence lengths, in characters and in whitespace-separated words.
lengths_chars = [len(sentence) for sentence in sentences]
lengths_words = [len(sentence.split()) for sentence in sentences]

# Función auxiliares
def get_stats(lengths):
    """Return descriptive statistics for a sequence of lengths.

    The result is a dict keyed, in this order, by "Media", "Mediana",
    "Moda", "P25", "P75", "Mínimo", "Máximo" and "Desv. Estándar".
    """
    summary = {}
    summary["Media"] = np.mean(lengths)
    summary["Mediana"] = np.median(lengths)

    # Mode: among the distinct values, the one occurring most often in the
    # original sequence.
    distinct_values = set(lengths)
    occurrences = list(lengths).count
    summary["Moda"] = max(distinct_values, key=occurrences)

    for label, percent in (("P25", 25), ("P75", 75)):
        summary[label] = np.percentile(lengths, percent)

    summary["Mínimo"] = min(lengths)
    summary["Máximo"] = max(lengths)
    summary["Desv. Estándar"] = np.std(lengths)
    return summary

# Statistics for both length measures.
stats_chars = get_stats(lengths_chars)
stats_words = get_stats(lengths_words)

# Combined table: one row per statistic, one column per length measure.
headers = ["Estadística", "Caracteres", "Palabras"]
table = [
    ["Media", f"{stats_chars['Media']:.1f}", f"{stats_words['Media']:.1f}"],
    ["Mediana", int(stats_chars["Mediana"]), int(stats_words["Mediana"])],
    ["Moda", stats_chars["Moda"], stats_words["Moda"]],
    ["Percentil 25", f"{stats_chars['P25']:.1f}", f"{stats_words['P25']:.1f}"],
    ["Percentil 75", f"{stats_chars['P75']:.1f}", f"{stats_words['P75']:.1f}"],
    ["Mínimo", stats_chars["Mínimo"], stats_words["Mínimo"]],
    ["Máximo", stats_chars["Máximo"], stats_words["Máximo"]],
    ["Desv. Estándar", f"{stats_chars['Desv. Estándar']:.1f}", f"{stats_words['Desv. Estándar']:.1f}"]
]

print("\nTabla comparativa de longitudes")
print(tabulate(table, headers=headers, tablefmt="grid", stralign="center"))

# Positions of the longest and shortest sentences, measured in words.
max_len_idx = np.argmax(lengths_words)
min_len_idx = np.argmin(lengths_words)

print("\n" + "-"*50)
print(f"Frase más larga ({lengths_words[max_len_idx]} palabras):")
print(f'"{sentences[max_len_idx]}"')

print("\n" + "-"*50)
print(f"Frase más corta ({lengths_words[min_len_idx]} palabras):")
print(f'"{sentences[min_len_idx]}"')

Tabla comparativa de longitudes
+----------------+--------------+------------+
|  Estadística   |   Caracteres |   Palabras |
+================+==============+============+
|     Media      |         68.1 |       11.5 |
+----------------+--------------+------------+
|    Mediana     |         66   |       11   |
+----------------+--------------+------------+
|      Moda      |         62   |       10   |
+----------------+--------------+------------+
|  Percentil 25  |         62   |       10   |
+----------------+--------------+------------+
|  Percentil 75  |         74.2 |       13   |
+----------------+--------------+------------+
|     Mínimo     |         52   |        8   |
+----------------+--------------+------------+
|     Máximo     |         90   |       18   |
+----------------+--------------+------------+
| Desv. Estándar |          8.8 |        2   |
+----------------+--------------+------------+

--------------------------------------------------
Frase más larga (18 palabras):
"The harp had an intricate wood carving, making it a piece of art as well as an instrument."

--------------------------------------------------
Frase más corta (8 palabras):
"The triangle was simple but required precise timing."

# One wide figure with two side-by-side panels.
plt.figure(figsize=(15, 5))

# Left panel: histogram of words per sentence.
plt.subplot(1, 2, 1)
plt.hist(lengths_words, bins=15, color='skyblue', edgecolor='black')
plt.title('Distribución de palabras por frase')
plt.xlabel('Número de palabras')
plt.ylabel('Frecuencia')

# Right panel: horizontal boxplot of characters per sentence.
plt.subplot(1, 2, 2)
plt.boxplot(lengths_chars, vert=False, patch_artist=True, 
           boxprops=dict(facecolor='lightgreen'))
plt.title('Distribución de caracteres por frase')
plt.xlabel('Número de caracteres')

plt.tight_layout()
plt.show()

# Frequency of lower-cased, whitespace-separated tokens over all sentences.
# Punctuation is not stripped, so it stays attached to the tokens.
word_freq = Counter(" ".join(sentences).lower().split())
top_words = word_freq.most_common(30)

print("\nTop 30 palabras más frecuentes")
# Ranking is printed 1-based.
for i, (word, count) in enumerate(top_words, 1):
    print(f"{i}. {word}: {count} ocurrencias")

Top 30 palabras más frecuentes
1. the: 144 ocurrencias
2. a: 54 ocurrencias
3. was: 49 ocurrencias
4. of: 32 ocurrencias
5. it: 27 ocurrencias
6. in: 21 ocurrencias
7. to: 15 ocurrencias
8. and: 14 ocurrencias
9. its: 14 ocurrencias
10. with: 12 ocurrencias
11. that: 12 ocurrencias
12. so: 11 ocurrencias
13. had: 11 ocurrencias
14. for: 10 ocurrencias
15. making: 10 ocurrencias
16. an: 9 ocurrencias
17. could: 8 ocurrencias
18. were: 7 ocurrencias
19. into: 7 ocurrencias
20. like: 5 ocurrencias
21. but: 5 ocurrencias
22. seemed: 5 ocurrencias
23. small: 5 ocurrencias
24. as: 5 ocurrencias
25. on: 4 ocurrencias
26. plant: 4 ocurrencias
27. made: 4 ocurrencias
28. car: 4 ocurrencias
29. unique: 4 ocurrencias
30. required: 3 ocurrencias

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud built from the raw token frequencies computed above.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes; only the image matters
plt.title("Nube de Palabras - Frecuencias")
plt.show()

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 32.3 MB/s eta 0:00:0000:0100:01
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')

import spacy

# Load the English language model (en_core_web_sm); the dataset sentences
# are in English.
nlp = spacy.load("en_core_web_sm")

# Preprocessing: lemmatisation plus removal of punctuation and stop words.
sentences_clean = []

# The list is only read, so it is iterated directly without copying it.
for i, sent in enumerate(sentences):
    doc = nlp(sent)
    # Keep lower-cased lemmas of alphabetic tokens that are not stop words.
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]
    clean_sent = " ".join(tokens)  # join the tokens with single spaces
    sentences_clean.append(clean_sent)
    if i < 5:  # show only the first 5 sentences
        print(f"Original {i + 1}: {sent}")
        print(f"Limpia   {i + 1}: {clean_sent}\n")

Original 1: The dragonfly's wings sparkled like gemstones in the sunlight.
Limpia   1: dragonfly wing sparkle like gemstone sunlight

Original 2: The bagpipes were adorned with a tartan pattern, honoring Scottish tradition.
Limpia   2: bagpipe adorn tartan pattern honor scottish tradition

Original 3: The fur pattern on the tiger was striking and intimidating.
Limpia   3: fur pattern tiger strike intimidating

Original 4: The triangle was simple but required precise timing.
Limpia   4: triangle simple require precise timing

Original 5: The sequoia tree was so tall it seemed to touch the sky.
Limpia   5: sequoia tree tall touch sky

from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Recompute the frequencies on the lemmatised, stop-word-free sentences and
# draw a second word cloud from them.
word_freq = Counter(" ".join(sentences_clean).lower().split())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes; only the image matters
plt.title("Nube de Palabras 2 - Frecuencias")
plt.show()

from sentence_transformers import SentenceTransformer

# Reload the original sentences; explicit UTF-8 avoids depending on the
# platform default encoding.
with open('sentences.json', encoding='utf-8') as f:
    sentences = json.load(f)

# Encode all sentences in one batched call.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences, show_progress_bar=True)
print(f"El dataset tiene {len(sentences)} frases. Se computaron {embeddings.shape[0]} embeddings de dimensión {embeddings.shape[1]}.")

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

El dataset tienee 100 frases. Se computaron 100 embeddings de dimensión 384.

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# n_components is the number of dimensions the data is reduced to.
# random_state fixes a seed so that results are reproducible.
# perplexity controls how many nearest neighbours are taken into account to
# preserve the structure as far as possible when projecting.
tsne = TSNE(n_components=2, perplexity=15, random_state=15) 
embeddings_2d_tsne = tsne.fit_transform(embeddings)
plt.figure(figsize=(8,6))
plt.scatter(embeddings_2d_tsne[:, 0], embeddings_2d_tsne[:, 1]) 
# Label only the first 10 points with their sentence text.
for i, frase in enumerate(sentences[:10]):
    plt.annotate(frase, (embeddings_2d_tsne[i, 0], embeddings_2d_tsne[i, 1]))
plt.title("Visualización t-SNE de frases")
plt.xlabel("Dimensión 1")
plt.ylabel("Dimensión 2")
plt.show()

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Same projection as the 2D case, but reduced to three dimensions.
tsne = TSNE(n_components=3, perplexity=15, random_state=15)
embeddings_3d_tsne = tsne.fit_transform(embeddings)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d') 
ax.scatter(embeddings_3d_tsne[:, 0], embeddings_3d_tsne[:, 1], embeddings_3d_tsne[:, 2])
# Label only the first 5 points with their sentence text.
for i, frase in enumerate(sentences[:5]):
    ax.text(embeddings_3d_tsne[i, 0], embeddings_3d_tsne[i, 1], embeddings_3d_tsne[i, 2], frase)
ax.set_title("Visualización t-SNE de frases (3D)")
ax.set_xlabel("Dimensión 1")
ax.set_ylabel("Dimensión 2")
ax.set_zlabel("Dimensión 3")
plt.show()

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# Elbow method: plot the WCSS for k = 1..10 to choose the number of clusters.
# The clustering is fitted on the 2D t-SNE projection, not on the original
# embeddings.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(embeddings_2d_tsne)
    wcss.append(kmeans.inertia_)
plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), wcss, marker='o', linestyle='--')
plt.title('Método del codo con WCSS para k óptimo')
plt.xlabel('Número de clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.grid(True)
plt.show()

# Choose k and fit k-means; labels are shifted by one so that cluster ids
# run from 1 to k.
k = 4
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(embeddings_2d_tsne) + 1

# Plot the points coloured by cluster, together with the centroids.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(embeddings_2d_tsne[:, 0], embeddings_2d_tsne[:, 1],
                      c=clusters, cmap='viridis', alpha=0.7, s=50)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], 
            marker='X', s=200, c='red', label='Centroides')
plt.colorbar(scatter, label='Cluster')
plt.title(f'Clustering k-means (k={k}) en proyección t-SNE 2D')
plt.xlabel('Dimensión 1')
plt.ylabel('Dimensión 2')
plt.legend()
plt.grid(True)
plt.show()

# Print sample sentences for each cluster (ids are 1-based, see above).
for cluster_id in range(1, k + 1):
    print(f"\nCluster {cluster_id}:")
    cluster_indices = np.where(clusters == cluster_id)[0]
    for idx in cluster_indices[:5]: # show at most 5 sentences per cluster
        print(f"- {sentences[idx]}")

Cluster 1:
- The bagpipes were adorned with a tartan pattern, honoring Scottish tradition.
- The origami crane was minute but intricately folded.
- The didgeridoo produced a haunting sound that was deeply rooted in Aboriginal culture.
- The pickup truck had a toolbox and a rack for carrying lumber.
- The tuba was so large it required its own seat in the orchestra.

Cluster 2:
- The fur pattern on the tiger was striking and intimidating.
- The sequoia tree was so tall it seemed to touch the sky.
- The mimosa plant responded to touch by folding its leaves.
- The porcupine had quills that could detach when threatened.
- The dolphin's streamlined shape made it look graceful in water.

Cluster 3:
- The triangle was simple but required precise timing.
- The microchip was so small it could be embedded under the skin.
- The virtual reality headset offered an immersive experience.
- The length of the train seemed to stretch into the horizon.
- The sitar's complex architecture made it visually fascinating.

Cluster 4:
- The dragonfly's wings sparkled like gemstones in the sunlight.
- The dandelion seed head formed a delicate sphere, ready to be carried away by the wind.
- The petals of the rose were arranged in perfect symmetry.
- The quilt was a kaleidoscope of colors, each patch telling its own story.
- The aloe vera plant was known for its medicinal qualities.

# Precomputar embeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm  

model = SentenceTransformer('all-MiniLM-L6-v2')
# encode() accepts the whole list and batches it internally, which avoids
# one model call per sentence; the result is a 2D array with one row per
# sentence. show_progress_bar keeps a progress indicator.
sentence_embeddings = model.encode(sentences, show_progress_bar=True)

Codificando frases:   0%|          | 0/100 [00:00<?, ?it/s]

# Try the semantic search with a sample query.
query = "prompt ocurrente" # user prompt
query_embedding = model.encode([query])  # shape (1, dim): a one-item batch

# A single vectorised call compares the query with every sentence embedding;
# row 0 of the result holds the similarities for the only query.
similarities = cosine_similarity(query_embedding, sentence_embeddings)[0]

# Indices of the top_n most similar sentences, best first.
top_n = 5
indices = similarities.argsort()[::-1][:top_n]
print(f"\nFrases más relacionadas con: '{query}'\n")
for idx in indices:
    print(f"- {sentences[idx]}  (similitud: {similarities[idx]:.2f})")

Calculando similitudes:   0%|          | 0/100 [00:00<?, ?it/s]

Frases más relacionadas con: 'prompt ocurrente'

- The school bus flaunted the iconic bright yellow, capturing attention immediately.  (similitud: 0.20)
- The porcupine had quills that could detach when threatened.  (similitud: 0.20)
- The fashion accessory was adorned with complex beadwork, making it a statement piece.  (similitud: 0.18)
- The platypus appeared bizarre with its duck-like bill and beaver-like tail.  (similitud: 0.17)
- The didgeridoo produced a haunting sound that was deeply rooted in Aboriginal culture.  (similitud: 0.17)

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy

# Rebind nlp to the Spanish pipeline used by lemmatize_sentences below
# (presumably because texto.txt is written in Spanish).
# NOTE(review): only en_core_web_sm is downloaded in the visible cells -
# confirm es_core_news_sm is installed.
nlp = spacy.load("es_core_news_sm")

def split_text_into_sentences(text):
    """Split *text* into sentences at whitespace following '.', '!' or '?'.

    Returns the list of non-empty sentences, each stripped of surrounding
    whitespace and still carrying its final punctuation mark.
    """
    sentences = []
    for fragment in re.split(r'(?<=[.!?])\s+', text.strip()):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def lemmatize_sentences(sentences):
    """Return a cleaned version of every sentence in *sentences*.

    Each sentence is parsed with the module-level ``nlp`` pipeline; only
    alphabetic tokens that are not stop words are kept, replaced by their
    lower-cased lemma and joined with single spaces.
    """
    return [
        " ".join(
            tok.lemma_.lower()
            for tok in nlp(sentence)
            if tok.is_alpha and not tok.is_stop
        )
        for sentence in sentences
    ]

def svd_reconstruct_text(text, k=5, top_words=5):
    """Summarise each sentence of *text* by its strongest rank-k TF-IDF terms.

    Args:
        text: raw text; it is split into sentences and lemmatised first.
        k: number of singular values/vectors kept for the approximation.
        top_words: maximum number of terms reported per sentence. Values
            <= 0 yield empty strings.

    Returns:
        A list with one string per sentence: up to *top_words* vocabulary
        terms with positive reconstructed weight, ordered by decreasing
        weight. An empty list is returned when *text* has no sentences.
    """
    sentences = split_text_into_sentences(text)
    if not sentences:
        # Nothing to vectorise; avoid failing inside the vectorizer.
        return []
    clean_sentences = lemmatize_sentences(sentences)

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(clean_sentences).toarray()

    U, s, Vt = np.linalg.svd(tfidf, full_matrices=False)

    # Keep only the first k components and rebuild the matrix from them.
    Uk = U[:, :k]
    sk = np.diag(s[:k])
    Vtk = Vt[:k, :]

    tfidf_approx = Uk @ sk @ Vtk

    vocabulary = vectorizer.get_feature_names_out()
    reconstructed_sentences = []

    # Slicing with [-top_words:] would select *every* term when top_words is
    # 0, so the order is reversed first and a clamped prefix is taken.
    limit = max(top_words, 0)
    for row in tfidf_approx:
        top_indices = np.argsort(row)[::-1][:limit]
        words = [vocabulary[i] for i in top_indices if row[i] > 0]
        reconstructed_sentences.append(" ".join(words))

    return reconstructed_sentences

def remove_duplicates_preserve_order(words):
    """Return the items of *words* without repeats, in first-seen order."""
    # Dict keys are unique and keep insertion order, so building a dict from
    # the items drops later duplicates while preserving the original order.
    return list(dict.fromkeys(words))

def process_text_svd_reconstruction(input_path='texto.txt', output_path='text_resumed.txt', k=5, top_words=3):
    """Summarise a text file with the SVD reconstruction and save the result.

    Reads *input_path* as UTF-8, obtains the per-sentence term lists from
    ``svd_reconstruct_text`` (forwarding *k* and *top_words*), merges them
    into one space-separated string in which every word appears only once
    (first occurrence wins), prints that string and writes it to
    *output_path* as UTF-8.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    sentence_summaries = svd_reconstruct_text(raw_text, k=k, top_words=top_words)

    # Flatten the per-sentence summaries into a single sequence of words.
    flat_words = [word for summary in sentence_summaries for word in summary.split()]

    # Drop repeated words while keeping the order of first appearance.
    reconstructed_text = " ".join(remove_duplicates_preserve_order(flat_words))

    print("Resumen del texto: \n")
    print(reconstructed_text)

    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(reconstructed_text)

# Summarise texto.txt keeping 5 SVD components and up to 5 terms per sentence.
process_text_svd_reconstruction('texto.txt', 'text_resumed.txt', k=5, top_words=5)

Resumen del texto: 

real implementación algoritmo plan trabajo ruido tesis técnica nube punto detalle método geométrico desarrollo preservación filtro artículo análisis problema resultado limitación planteamiento presencia aplicación sintético modelo dato dependencia reducir revisión grafo basado redacción actual validación métrica entorno supervisada

Operación	Imágenes	Texto
Entrada	Matriz de píxeles	Matriz TF-IDF
Salida	Imagen aproximada	Términos más relevantes
Pérdida (de)	Detalles finos (e.g. bordes, gradientes altos)	Palabras menos importantes (semánticamente)

notebook.ipynb

Instalación

Declaración de autoría

Dataset

Compresión de las sentencias

Reducción de la dimensionalidad

Enfoque alternativo: partir de matriz TF-IDF

Análisis (estadístico) exploratorio del dataset

Longitud de frase

Histograma de longitudes

Análisis de frecuencia de palabras

Nube de palabras

Análisis de patrones sintácticos

Preprocesamiento

Análisis semántico

Reducción de dimensionalidad

Clustering

Búsqueda semántica

Resumen de texto

Conclusiones

Referencias