
Extracting RSS entries, translating them, and generating summaries with Python







The example below uses the feedparser, BeautifulSoup, nltk, and googletrans (Google Translate) libraries.
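If the libraries are missing, they can usually be installed with pip. The googletrans pin below is one release known to work at the time of writing; adjust it to your environment if needed:

pip install feedparser beautifulsoup4 nltk googletrans==4.0.0rc1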

import time
from heapq import nlargest

import feedparser
import nltk
from bs4 import BeautifulSoup
from googletrans import Translator
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

# Fetch the tokenizer models and stop word lists used below
# (both calls are no-ops once the data has already been downloaded).
nltk.download('punkt')
nltk.download('stopwords')

def translate_text(text):
    """Translate text to English; return None if the translation fails."""
    translator = Translator()
    try:
        translation = translator.translate(str(text), dest='en')
        # Pause briefly between requests so the free Google endpoint
        # does not start rejecting us.
        time.sleep(2)
        return translation.text
    except Exception:
        # googletrans raises on network problems and occasional API changes;
        # callers treat None as "skip this entry".
        return None
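A quick sanity check of translate_text, assuming a working network connection (the Lithuanian sample phrase is purely illustrative):

print(translate_text("Labas rytas, pasauli"))
# Should print something like: Good morning, world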

  

def summarize_html(html_content, num_sentences=2):
    """Summarize HTML content with a simple word-frequency heuristic."""
    # Strip the markup, then translate the plain text to English so that
    # the English stop word list and sentence tokenizer apply.
    soup = BeautifulSoup(html_content, 'html.parser')
    text = translate_text(soup.get_text())
    if text is None:
        return None

    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Count how often each non-stop word occurs.
    word_frequencies = {}
    for word in text.split():
        word = word.lower()
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Guard against empty or stop-word-only texts, where max() would fail.
    if not word_frequencies:
        return None

    # Normalize the counts so the most frequent word scores 1.0.
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= maximum_frequency

    # Score each sentence by the summed frequencies of the words in it.
    sentence_scores = {}
    for sentence in sentences:
        for word in sentence.split():
            if word.lower() in word_frequencies:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_frequencies[word.lower()]

    # The summary is the num_sentences highest-scoring sentences.
    summarized_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summarized_sentences)
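As a rough illustration, calling summarize_html on a small hand-written fragment (the markup below is only an example) should return the two highest-scoring sentences:

sample = "<p>Python is a popular language. It is easy to read. Python also powers many data and web tools, which keeps the Python community growing.</p>"
print(summarize_html(sample, num_sentences=2))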

  

def translate_and_summarize_article(url):
    """Fetch an RSS feed, translating and summarizing every entry in it."""
    translated_entries = []
    feed = feedparser.parse(url)
    for index, entry in enumerate(feed.entries):
        # Progress indicator: translation makes each entry take a few seconds.
        print(index)
        summary = summarize_html(entry.description)
        # Skip entries whose translation or summarization failed.
        if summary is not None:
            translated_entries.append({
                'title': translate_text(entry.title),
                'summary': summary,
            })
    return translated_entries
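Used on its own, the function returns a list of dictionaries, one per successfully summarized entry. The feed address below is only a placeholder; substitute any real RSS URL:

entries = translate_and_summarize_article("https://example.com/feed.xml")
for entry in entries:
    print(entry['title'], '-', entry['summary'])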

  

def read_and_summarize_rss_feeds():
    """Prompt for one or more comma-separated RSS feed URLs and print summaries."""
    rss_feed = input("\n\nType in your rss link\n\n")
    if rss_feed.lower() in ("exit", "end", "quit"):
        quit()
    # Several feeds can be supplied at once, separated by commas.
    feed_urls = rss_feed.split(",")
    for url in feed_urls:
        print(f"Reading RSS feed from: {url}\n")
        translated_entries = translate_and_summarize_article(url)
        for entry in translated_entries:
            print(f"Title: {entry['title']}\n")
            print(f"Summary: {entry['summary']}\n")

  

try:
    # Keep prompting for new feeds until the user exits.
    while True:
        read_and_summarize_rss_feeds()
except KeyboardInterrupt:
    print("\n\nCtrl+C detected. Exiting gracefully...\n\n")
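A typical session: paste one feed URL (or several separated by commas, e.g. the placeholder addresses https://example.com/a.xml,https://example.com/b.xml) at the prompt, read the printed titles and summaries, and type exit (or press Ctrl+C) to stop.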


All of the information presented here is the author's personal opinion. If anything is unclear, you are welcome to get in touch by e-mail: admin@artefaktas.eu
