Reading time: 1 minute

RSS įrašų išgavimas, jų vertimas ir santraukų generavimas naudojant Python

Sun, Feb 18, 2024

RSS įrašų išgavimas, jų vertimas ir santraukų generavimas naudojant Python programavimo kalbą

Pavyzdyje naudojamos feedparser, BeautifulSoup, nltk ir googletrans (Google-Translation) bibliotekos.

import feedparser

from bs4 import BeautifulSoup

from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords

from heapq import nlargest

from googletrans import Translator

import nltk

nltk.download('punkt')

import time

nltk.download('stopwords')

def translate_text(text):
    """Translate text to English.

    Args:
        text: Value to translate; it is converted with ``str()`` first.

    Returns:
        The translated string, or ``None`` if the translation failed.
    """
    text = str(text)
    translator = Translator()
    try:
        translation = translator.translate(text, dest='en')
        # Pause after every request (presumably to throttle calls to the
        # translation service -- confirm before changing the delay).
        time.sleep(2)
        return translation.text
    except Exception:
        # Best effort: any translation error is reported as None.  Catching
        # Exception instead of using a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate, so Ctrl+C pressed during the request or
        # the sleep still reaches the caller.
        return None

  

def summarize_html(html_content, num_sentences=2):
    """Summarize HTML content as a short English extract.

    The markup is stripped with BeautifulSoup, the plain text is translated
    to English with ``translate_text`` and the highest scoring sentences are
    joined into the summary.  A sentence's score is the sum of the normalized
    frequencies of its whitespace-separated, lower-cased tokens that are not
    English stop words.

    Args:
        html_content: HTML markup to summarize.
        num_sentences: Maximum number of sentences kept in the summary.

    Returns:
        The summary string (sentences in descending score order), an empty
        string when the text contains no scorable words, or ``None`` when
        the translation failed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    text = translate_text(soup.get_text())
    if text is None:
        return None

    stop_words = set(stopwords.words('english'))

    # Count every non-stop-word token of the translated text.
    word_frequencies = {}
    for token in text.split():
        word = token.lower()
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # Empty text or stop words only: nothing can be scored, and max()
        # below would raise ValueError on an empty sequence.
        return ''

    # Normalize the counts so the most frequent word weighs 1.0.
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Score only sentences that contain at least one counted word.
    sentence_scores = {}
    for sentence in sent_tokenize(text):
        for token in sentence.split():
            weight = word_frequencies.get(token.lower())
            if weight is not None:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weight

    summarized_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summarized_sentences)

  

def translate_and_summarize_article(url):
    """Translate and summarize every entry of the RSS feed at ``url``.

    Args:
        url: Feed location, passed unchanged to ``feedparser.parse``.

    Returns:
        A list of dicts with the keys ``'title'`` (translated title, or
        ``None`` if its translation failed) and ``'summary'``.  Entries
        without a description, or whose summary could not be produced, are
        left out.
    """
    translated_entries = []
    feed = feedparser.parse(url)
    for index, entry in enumerate(feed.entries):
        print(index)  # progress indicator: position of the current entry

        # Entries are dict-like; .get avoids an AttributeError when the feed
        # omits the field.
        description = entry.get('description')
        if not description:
            continue

        summary = summarize_html(description)
        if summary is None:
            continue

        # Translate the title only for entries that are kept: each
        # translate_text call is a remote request followed by a pause.
        translated_entries.append({
            'title': translate_text(entry.get('title', '')),
            'summary': summary,
        })
    return translated_entries

  

def read_and_summarize_rss_feeds():
    """Read and summarize multiple RSS feeds.

    Prompts once for a comma-separated list of feed URLs, then prints the
    translated title and the summary of every entry that could be
    summarized.  Entering ``exit``, ``end`` or ``quit`` (case-insensitive)
    terminates the program.

    The function handles a single prompt and then returns; repetition is
    left to the caller's loop, so the call stack no longer grows with every
    prompt as it did with a trailing self-call.

    Raises:
        SystemExit: When the user enters one of the exit keywords.
    """
    rss_feed = input("\n\nType in your rss link\n\n")
    if rss_feed.strip().lower() in ("exit", "end", "quit"):
        raise SystemExit

    # str.split never raises, so no fallback is needed; surrounding
    # whitespace and empty items (e.g. from "a, b," input) are dropped.
    feed_urls = [url.strip() for url in rss_feed.split(",") if url.strip()]

    for url in feed_urls:
        print(f"Reading RSS feed from: {url}\n")
        for entry in translate_and_summarize_article(url):
            print(f"Title: {entry['title']}\n")
            print(f"Summary: {entry['summary']}\n")

  

# Script entry point: keep asking for feeds until the user presses Ctrl+C
# (the prompt function itself exits the program on its exit keywords).
try:
    while True:
        read_and_summarize_rss_feeds()
except KeyboardInterrupt:
    # Leave with a short message instead of a traceback.
    print("\n\nCtrl+C detected. Exiting gracefully...\n\n")
    quit()

Inner Tags: #newpost #python #rss #santraukos #vertimas

Internal links if exist:

External links if exist:

Markdown external:

Visa pateikiama informacija - asmeninė autoriaus nuomonė. Kilus neaiškumams rekomenduojama susisiekti elektroniniu paštu: admin@artefaktas.eu

Artefaktas.eu is licensed under CC BY-NC-ND 4.0

RSS įrašų išgavimas, jų vertimas ir santraukų generavimas naudojant Python

RSS įrašų išgavimas, jų vertimas ir santraukų generavimas naudojant Python programavimo kalbą

Comments