##// END OF EJS Templates
Strip HTML tags from the fetched source's description
neko259 -
r1977:7b889029 default
parent child Browse files
Show More
@@ -1,74 +1,77 b''
import calendar
import logging
from datetime import datetime
from time import mktime

import feedparser
from django.db import models, transaction
from django.utils import timezone
from django.utils.dateparse import parse_datetime
from django.utils.html import strip_tags
from django.utils.timezone import utc

from boards import settings
from boards.models import Post
from boards.models.post import TITLE_MAX_LENGTH
from boards.utils import get_tripcode_from_text


# Upper bound on the length of the stored source-type identifier.
SOURCE_TYPE_MAX_LENGTH = 100

# Identifier of the only source type handled by the fetcher below.
SOURCE_TYPE_RSS = 'RSS'

# (stored value, displayed label) pairs offered for ThreadSource.source_type.
TYPE_CHOICES = (
    (SOURCE_TYPE_RSS, SOURCE_TYPE_RSS),
)


class ThreadSource(models.Model):
    """An external source whose new items are posted into a thread.

    Only RSS sources are handled: the feed is parsed with ``feedparser`` and
    every entry published after ``timestamp`` becomes a post in ``thread``.
    """

    class Meta:
        app_label = 'boards'

    # Name returned by __str__ and used in log messages.
    name = models.TextField()
    # Thread that receives the fetched items.
    thread = models.ForeignKey('Thread')
    # Only entries published strictly after this moment are posted.
    timestamp = models.DateTimeField()
    # Value handed to feedparser.parse() for RSS sources.
    source = models.TextField()
    source_type = models.CharField(max_length=SOURCE_TYPE_MAX_LENGTH,
                                   choices=TYPE_CHOICES)

    def __str__(self):
        return self.name

    @transaction.atomic
    def fetch_latest_posts(self):
        """Creates new posts with the info fetched since the timestamp.

        Nothing is fetched when the thread is archived. Otherwise every feed
        entry published after ``self.timestamp`` is posted into the thread,
        oldest first, and ``self.timestamp`` is advanced to the newest
        publication time seen and saved.

        Entries without a parsed publication date are logged and skipped
        instead of aborting the whole fetch.
        """
        logger = logging.getLogger('boards.source')

        if self.thread.is_archived():
            logger.error('The thread {} is archived, please try another one'.format(self.thread))
            return

        tripcode = get_tripcode_from_text(
            settings.get('External', 'SourceFetcherTripcode'))
        start_timestamp = self.timestamp
        last_timestamp = start_timestamp
        logger.info('Start timestamp is {}'.format(start_timestamp))

        # Reaching the bumplimit is only reported; fetching still proceeds.
        if self.thread.is_bumplimit():
            logger.warning('The thread {} has reached its bumplimit, please create a new one'.format(self.thread))

        if self.source_type == SOURCE_TYPE_RSS:
            feed = feedparser.parse(self.source)

            # Entries with no usable publication date can neither be ordered
            # nor compared with the stored timestamp, so filter them out
            # before sorting.
            dated_items = []
            for item in feed.entries:
                if item.get('published_parsed'):
                    dated_items.append(item)
                else:
                    logger.error('Invalid timestamp {} for {}'.format(
                        item.get('published'),
                        item.get('title', '')[:TITLE_MAX_LENGTH]))

            # Oldest first, so posts are created in publication order.
            dated_items.sort(key=lambda entry: entry.published_parsed)

            for item in dated_items:
                title = item.get('title', '')[:TITLE_MAX_LENGTH]
                # timegm() reads the parsed time tuple as UTC, matching tz=utc.
                timestamp = datetime.fromtimestamp(
                    calendar.timegm(item.published_parsed), tz=utc)

                if timestamp > last_timestamp:
                    last_timestamp = timestamp

                if timestamp > start_timestamp:
                    Post.objects.create_post(
                        title=title,
                        text=self.parse_text(item.get('description')),
                        thread=self.thread, file_urls=[item.link],
                        tripcode=tripcode)
                    logger.info('Fetched item {} from {} into thread {}'.format(
                        title, self.name, self.thread))

        logger.info('New timestamp is {}'.format(last_timestamp))
        self.timestamp = last_timestamp
        self.save(update_fields=['timestamp'])

    def parse_text(self, text):
        """Returns the text with HTML tags stripped.

        A missing or empty text yields an empty string, so that a feed entry
        without a description does not end up as the literal text 'None'.
        """
        if not text:
            return ''
        return strip_tags(text)
General Comments 0
You need to be logged in to leave comments. Login now