Reading time: 1 minute
RSS įrašų išgavimas, jų vertimas ir santraukų generavimas naudojant Python
RSS įrašų išgavimas, jų vertimas ir santraukų generavimas naudojant Python programavimo kalbą
Pavyzdyje naudojamos feedparser, BeautifulSoup, nltk ir googletrans (Google-Translation) bibliotekos.
import feedparser
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from heapq import nlargest
from googletrans import Translator
import nltk
nltk.download('punkt')
import time
nltk.download('stopwords')
def translate_text(text):
    """Translate text to English.

    Args:
        text: Value to translate; it is coerced to ``str`` first.

    Returns:
        The translated string, or ``None`` if the translation request (or
        reading its ``.text`` result) raised an exception.
    """
    text = str(text)
    translator = Translator()
    try:
        translation = translator.translate(text, dest='en')
        # Pause after each successful request -- presumably to avoid
        # hammering the translation service; confirm before changing it.
        time.sleep(2)
        return translation.text
    except Exception:
        # Best effort: a failed translation is reported as None.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so Ctrl+C pressed
        # during the request or the sleep is no longer silently swallowed.
        return None
def summarize_html(html_content, num_sentences=2):
    """Summarize HTML content.

    The markup is reduced to plain text, translated to English with
    ``translate_text`` and then summarized extractively: every sentence is
    scored by summing the normalized frequencies of the non-stop-words it
    contains, and the highest scoring sentences are joined with spaces.

    Args:
        html_content: HTML markup to summarize.
        num_sentences: Maximum number of sentences kept in the summary.

    Returns:
        The summary string (highest scoring sentence first), ``None`` when
        the translation failed, or ``''`` when the translated text contains
        no scorable words.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    text = translate_text(soup.get_text())
    if text is None:
        return None

    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Count every whitespace-separated, lower-cased token that is not a
    # stop word.
    word_frequencies = {}
    for token in text.split():
        word = token.lower()
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # Empty text, or text made only of stop words: max() below would
        # raise ValueError on an empty sequence, and no sentence could be
        # scored anyway, so the summary is empty.
        return ''

    # Normalize counts so the most frequent word weighs 1.0.
    maximum_frequency = max(word_frequencies.values())
    word_frequencies = {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }

    # Only sentences containing at least one scored word get an entry, so
    # sentences without any never make it into the summary.
    sentence_scores = {}
    for sentence in sentences:
        for token in sentence.split():
            weight = word_frequencies.get(token.lower())
            if weight is not None:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weight

    summarized_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summarized_sentences)
def translate_and_summarize_article(url):
    """Translate a feed's entries to English and summarize them.

    The feed at ``url`` is parsed with ``feedparser``. For each entry the
    title is translated with ``translate_text`` and the description is
    summarized with ``summarize_html``; the entry's index is printed as
    progress output. Entries whose summary is ``None`` are left out.

    Args:
        url: Location of the RSS feed handed to ``feedparser.parse``.

    Returns:
        A list of dicts, each with a ``'title'`` and a ``'summary'`` key.
    """
    results = []
    parsed_feed = feedparser.parse(url)
    for index, item in enumerate(parsed_feed.entries):
        print(index)
        # The title is translated before the description is summarized,
        # keeping the order of the outgoing translation requests.
        english_title = translate_text(item.title)
        digest = summarize_html(item.description)
        if digest is None:
            continue
        results.append({'title': english_title, 'summary': digest})
    return results
def read_and_summarize_rss_feeds():
    """Read and summarize multiple RSS feeds.

    Prompts for one or more comma-separated RSS links, then prints the
    translated title and summary of every entry returned by
    ``translate_and_summarize_article`` for each link.

    Raises:
        SystemExit: If the user types ``exit``, ``end`` or ``quit``
            (case-insensitive, surrounding whitespace ignored).
    """
    rss_feed = input("\n\nType in your rss link\n\n").strip()
    if rss_feed.lower() in ("exit", "end", "quit"):
        # Raise SystemExit directly instead of calling quit(), which is a
        # helper installed by the ``site`` module for interactive use.
        raise SystemExit

    # Strip whitespace around each link so "a, b" does not produce " b",
    # and drop empty segments left by stray commas or an empty input line.
    feed_urls = [part.strip() for part in rss_feed.split(",")]
    for url in feed_urls:
        if not url:
            continue
        print(f"Reading RSS feed from: {url}\n")
        translated_entries = translate_and_summarize_article(url)
        for entry in translated_entries:
            print(f"Title: {entry['title']}\n")
            print(f"Summary: {entry['summary']}\n")
# Keep prompting for feeds until the user types an exit keyword or presses
# Ctrl+C. Every prompt -- including the very first one -- runs inside the
# try block, so Ctrl+C always produces the graceful message instead of a
# traceback.
try:
    while True:
        read_and_summarize_rss_feeds()
except KeyboardInterrupt:
    print("\n\nCtrl+C detected. Exiting gracefully...\n\n")
    raise SystemExit
Inner Tags: #newpost #python #rss #santraukos #vertimas
Internal links if exist:
External links if exist:
Markdown external:
Visa pateikiama informacija - asmeninė autoriaus nuomonė. Kilus neaiškumams rekomenduojama susisiekti elektroniniu paštu: admin@artefaktas.eu
Artefaktas.eu is licensed under CC BY-NC-ND 4.0