Skip to content

Commit

Permalink
Merge pull request #64 from j0k3r/pdf-utf8
Browse files Browse the repository at this point in the history
Extract stuff from command and move it to a service
  • Loading branch information
j0k3r committed Dec 17, 2015
2 parents ca97720 + 000dd43 commit 4c50896
Show file tree
Hide file tree
Showing 21 changed files with 271 additions and 195 deletions.
10 changes: 10 additions & 0 deletions app/config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,13 @@ stof_doctrine_extensions:
default:
timestampable: true
sluggable: true

monolog:
channels: ['import']
handlers:
console:
type: console
verbosity_levels:
VERBOSITY_NORMAL: DEBUG
channels: import
formatter: monolog.import.formatter
142 changes: 3 additions & 139 deletions src/FeedBundle/Command/FetchItemsCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,6 @@
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Filesystem\LockHandler;
use Symfony\Component\Console\Helper\ProgressBar;
use Api43\FeedBundle\Document\Feed;
use Api43\FeedBundle\Document\FeedItem;
use Api43\FeedBundle\Document\FeedLog;
use Api43\FeedBundle\Event\FeedItemEvent;
use Api43\FeedBundle\Api43FeedEvents;

class FetchItemsCommand extends ContainerAwareCommand
{
Expand Down Expand Up @@ -74,140 +68,10 @@ protected function execute(InputInterface $input, OutputInterface $output)
$output->writeln('<info>Feeds to check</info>: '.count($feeds));
}

$totalCached = 0;
$feedUpdated = array();

foreach ($feeds as $feed) {
if ($output->isVerbose()) {
$output->writeln('<info>Working on</info>: '.$feed->getName().' (parser: <comment>'.$feed->getParser().'</comment>)');
}

$rssFeed = $container
->get('simple_pie_proxy')
->setUrl($feed->getLink())
->init();

// update feed description, in case it was empty
if (0 === strlen($feed->getDescription()) && 0 !== strlen($rssFeed->get_description())) {
$feed->setDescription(html_entity_decode($rssFeed->get_description(), ENT_COMPAT, 'UTF-8'));
$dm->persist($feed);
$dm->flush($feed);
}

$parser = $container
->get('content_extractor')
->init($feed->getParser(), $feed, true);

$cachedLinks = $feedItemRepo->getAllLinks($feed->getId());
$cached = 0;

// show progress bar in trace mode only
if ($output->isVerbose()) {
$total = $rssFeed->get_item_quantity();
$progress = new ProgressBar($output, $total);
$progress->start();
}

foreach ($rssFeed->get_items() as $item) {
if ($output->isVerbose()) {
$progress->advance();
}

// if an item already exists, we skip it
// or if the item doesn't have a link, we won't cache it - will be useless
if (isset($cachedLinks[$item->get_permalink()]) || null === $item->get_permalink()) {
continue;
}

$parsedContent = $parser->parseContent(
$item->get_permalink(),
$item->get_description()
);

// if readable content failed, use default one from feed item
$content = $parsedContent->content;
if (false === $content) {
$content = $item->get_content();
}

// if there is no date in the feed, we use the current one
$date = $item->get_date();
if (null === $date) {
$date = date('j F Y, g:i:s a');
}

$feedItem = new FeedItem();
$feedItem->setTitle(html_entity_decode($item->get_title(), ENT_COMPAT, 'UTF-8'));
$feedItem->setLink($parsedContent->url);
$feedItem->setContent($content);
$feedItem->setPermalink($item->get_permalink());
$feedItem->setPublishedAt($date);
$feedItem->setFeed($feed);
$dm->persist($feedItem);

++$cached;
}

if ($output->isVerbose()) {
$progress->finish();
$output->writeln('');
}

if ($cached) {
// save the last time items were updated
$feed->setLastItemCachedAt(date('j F Y, g:i:s a'));
$dm->persist($feed);

$totalCached += $cached;

$feedLog = new FeedLog();
$feedLog->setItemsNumber($cached);
$feedLog->setFeed($feed);

$dm->persist($feedLog);

// store feed url updated, to ping hub later
$feedUpdated[] = $feed->getSlug();
}

if ($output->isVerbose()) {
$output->writeln('<info>New cached items</info>: '.$cached);
}

$dm->flush();
}

if (!empty($feedUpdated)) {
if ($output->isVerbose()) {
$output->writeln('<info>Ping hubs...</info>');
}

// send an event about new feed updated
$event = new FeedItemEvent($feedUpdated);

$container->get('event_dispatcher')->dispatch(
Api43FeedEvents::AFTER_ITEM_CACHED,
$event
);
}
// let's import some stuff !
$import = $container->get('content_import');
$totalCached = $import->process($feeds);

$output->writeLn('<comment>'.$totalCached.'</comment> items cached.');

// update nb items for each updated feed
foreach ($feedUpdated as $slug) {
$feed = $feedRepo->findOneByslug($slug);

$nbItems = $feedItemRepo->countByFeedId($feed->getId());

$feed->setNbItems($nbItems);
$dm->persist($feed);

if ($output->isVerbose()) {
$output->writeln('<info>'.$feed->getName().'</info> items updated: <comment>'.$nbItems.'</comment>');
}
}

$dm->flush();
$dm->clear();
}
}
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
<?php

namespace Api43\FeedBundle\Services;
namespace Api43\FeedBundle\Content;

use Api43\FeedBundle\Document\Feed;
use Api43\FeedBundle\Extractor\ExtractorChain;
use Api43\FeedBundle\Improver\ImproverChain;
use Api43\FeedBundle\Parser\ParserChain;

class ContentExtractor
class Extractor
{
protected $feed = null;
protected $extractorChain;
Expand Down Expand Up @@ -42,7 +42,7 @@ public function __construct(ExtractorChain $extractorChain, ImproverChain $impro
* @param bool $allowAllParser Define if we have to use all *known* parser to get the content if the defined one failed.
* For example, Internal parser can't make content readable, it will use the External one, etc ..
*
* @return ContentExtractor Current object
* @return Extractor Current object
*/
public function init($chosenParser, Feed $feed = null, $allowAllParser = false)
{
Expand All @@ -63,7 +63,7 @@ public function init($chosenParser, Feed $feed = null, $allowAllParser = false)
* @param string $url RSS item url
* @param string|null $itemContent RSS item content, which will be taken if we can't extract content from url
*
* @return ContentExtractor
* @return Extractor
*/
public function parseContent($url, $itemContent = null)
{
Expand Down
163 changes: 163 additions & 0 deletions src/FeedBundle/Content/Import.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
<?php

namespace Api43\FeedBundle\Content;

use Symfony\Component\EventDispatcher\EventDispatcherInterface;
use Doctrine\ODM\MongoDB\DocumentManager;
use Psr\Log\LoggerInterface;
use Api43\FeedBundle\Xml\SimplePieProxy;
use Api43\FeedBundle\Document\FeedItem;
use Api43\FeedBundle\Document\FeedLog;
use Api43\FeedBundle\Event\FeedItemEvent;
use Api43\FeedBundle\Api43FeedEvents;

/**
 * Imports new items for a set of feeds.
 *
 * For each feed it fetches the XML through SimplePieProxy, extracts the content
 * of every item that is not cached yet, stores it as a FeedItem, records a
 * FeedLog, dispatches an event for the updated feeds and finally refreshes the
 * item counter of each updated feed.
 */
class Import
{
    private $logger;
    private $simplePieProxy;
    private $extractor;
    private $eventDispatcher;
    private $dm;

    public function __construct(SimplePieProxy $simplePieProxy, Extractor $extractor, EventDispatcherInterface $eventDispatcher, DocumentManager $dm, LoggerInterface $logger)
    {
        $this->simplePieProxy = $simplePieProxy;
        $this->extractor = $extractor;
        $this->eventDispatcher = $eventDispatcher;
        $this->dm = $dm;
        $this->logger = $logger;
    }

    /**
     * Process feeds in parameter:
     *     - fetch xml feed
     *     - retrieve all links inside it
     *     - extract content
     *     - create a FeedItem with all information
     *     - a FeedLog with all item cached
     *     - if there are new content, dispatch event to ping hub
     *     - finally, update total item counter.
     *
     * @param array $feeds An array for Api43\FeedBundle\Document\Feed or an Doctrine\ODM\MongoDB\EagerCursor
     *
     * @return int Total number of items cached across all given feeds
     */
    public function process($feeds)
    {
        $totalCached = 0;
        $feedUpdated = array();
        $feedRepo = $this->dm->getRepository('Api43FeedBundle:Feed');
        $feedItemRepo = $this->dm->getRepository('Api43FeedBundle:FeedItem');

        foreach ($feeds as $feed) {
            $cached = $this->importFeed($feed, $feedItemRepo);

            if ($cached > 0) {
                // save the last time items were updated
                $feed->setLastItemCachedAt(date('j F Y, g:i:s a'));
                $this->dm->persist($feed);

                $totalCached += $cached;

                $feedLog = new FeedLog();
                $feedLog->setItemsNumber($cached);
                $feedLog->setFeed($feed);

                $this->dm->persist($feedLog);

                // store feed slug updated, to ping hub later
                $feedUpdated[] = $feed->getSlug();
            }

            $this->logger->debug('<info>New cached items</info>: '.$cached);

            // flush once per feed so a failure on a later feed doesn't lose this one
            $this->dm->flush();
        }

        if (!empty($feedUpdated)) {
            $this->logger->debug('<info>Ping hubs...</info>');

            // send an event about new feed updated
            $this->eventDispatcher->dispatch(
                Api43FeedEvents::AFTER_ITEM_CACHED,
                new FeedItemEvent($feedUpdated)
            );
        }

        $this->updateItemCounters($feedUpdated, $feedRepo, $feedItemRepo);

        $this->dm->flush();
        $this->dm->clear();

        return $totalCached;
    }

    /**
     * Fetch one feed and persist (without flushing) a FeedItem for every item not cached yet.
     *
     * @param object $feed         Feed document to import
     * @param object $feedItemRepo FeedItem repository, used to retrieve links already cached
     *
     * @return int Number of new items persisted for this feed
     */
    private function importFeed($feed, $feedItemRepo)
    {
        $this->logger->debug('<info>Working on</info>: '.$feed->getName().' (parser: <comment>'.$feed->getParser().'</comment>)');

        $rssFeed = $this
            ->simplePieProxy
            ->setUrl($feed->getLink())
            ->init();

        // update feed description, in case it was empty
        // (cast to string: both values may be null and strlen(null) is deprecated on recent PHP)
        $rssDescription = (string) $rssFeed->get_description();
        if ('' === (string) $feed->getDescription() && '' !== $rssDescription) {
            $feed->setDescription(html_entity_decode($rssDescription, ENT_COMPAT, 'UTF-8'));
            $this->dm->persist($feed);
            $this->dm->flush($feed);
        }

        $parser = $this
            ->extractor
            ->init($feed->getParser(), $feed, true);

        // links already stored for this feed, indexed by permalink
        $cachedLinks = $feedItemRepo->getAllLinks($feed->getId());
        $cached = 0;

        $this->logger->debug('<info>Link to check</info>: <comment>'.$rssFeed->get_item_quantity().'</comment>');

        foreach ($rssFeed->get_items() as $item) {
            $permalink = $item->get_permalink();

            // if the item doesn't have a link, we won't cache it - will be useless
            // or if an item already exists, we skip it
            if (null === $permalink || isset($cachedLinks[$permalink])) {
                continue;
            }

            $this->logger->debug(' <info>Parse content for url</info>: <comment>'.$permalink.'</comment>');

            $parsedContent = $parser->parseContent(
                $permalink,
                $item->get_description()
            );

            // if readable content failed, use default one from feed item
            $content = $parsedContent->content;
            if (false === $content) {
                $content = $item->get_content();
            }

            // if there is no date in the feed, we use the current one
            $date = $item->get_date();
            if (null === $date) {
                $date = date('j F Y, g:i:s a');
            }

            $feedItem = new FeedItem();
            $feedItem->setTitle(html_entity_decode($item->get_title(), ENT_COMPAT, 'UTF-8'));
            $feedItem->setLink($parsedContent->url);
            $feedItem->setContent($content);
            $feedItem->setPermalink($permalink);
            $feedItem->setPublishedAt($date);
            $feedItem->setFeed($feed);
            $this->dm->persist($feedItem);

            // remember the link so the same permalink appearing twice
            // in this feed payload isn't stored twice
            $cachedLinks[$permalink] = true;

            ++$cached;
        }

        return $cached;
    }

    /**
     * Refresh the item counter of each updated feed (persisted, not flushed).
     *
     * @param array  $slugs        Slugs of the feeds which received new items
     * @param object $feedRepo     Feed repository
     * @param object $feedItemRepo FeedItem repository
     */
    private function updateItemCounters(array $slugs, $feedRepo, $feedItemRepo)
    {
        // update nb items for each updated feed
        foreach ($slugs as $slug) {
            $feed = $feedRepo->findOneByslug($slug);

            // the lookup can come back empty; skip instead of calling a method on null
            if (null === $feed) {
                $this->logger->warning('<error>Feed not found for slug</error>: '.$slug);

                continue;
            }

            $nbItems = $feedItemRepo->countByFeedId($feed->getId());

            $feed->setNbItems($nbItems);
            $this->dm->persist($feed);

            $this->logger->debug('<info>'.$feed->getName().'</info> items updated: <comment>'.$nbItems.'</comment>');
        }
    }
}
2 changes: 1 addition & 1 deletion src/FeedBundle/Controller/FeedController.php
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ private function createDeleteForm()
public function xmlAction(Feed $feed)
{
return new Response(
$this->get('rss_render')->render($feed),
$this->get('xml_render')->doRender($feed),
200,
array('Content-Type' => 'text/xml')
);
Expand Down
2 changes: 1 addition & 1 deletion src/FeedBundle/Controller/FeedTestController.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class FeedTestController extends Controller
* - chose the best parser
* - test a site configuration.
*
* @return array
* @return \Symfony\Component\HttpFoundation\Response
*/
public function indexAction(Request $request)
{
Expand Down
Loading

0 comments on commit 4c50896

Please sign in to comment.