All scripts in one repository

This commit is contained in:
mmk2410 2015-11-12 22:36:23 +01:00
parent a0a86492ee
commit 9f9304d6aa
61 changed files with 6668 additions and 681 deletions

22
blogger2rangitaki/LICENSE Normal file
View file

@ -0,0 +1,22 @@
COPYRIGHT (c) 2015 mmk2410
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -0,0 +1,31 @@
# blogger2rangitaki
This is a small PHP script for converting a Blogger XML to Rangitaki blog posts.
This script uses [html-to-markdown](https://github.com/thephpleague/html-to-markdown) to convert the blogposts.
## Usage
You don't need to install this script on your computer. It is enough to make it executable:
```
chmod +x blogger2rangitaki.php
```
And to run it:
```
./blogger2rangitaki.php blog.xml
```
where `blog.xml` is your Blogger XML file (the exported blog).
**This script doesn't import your media files into Rangitaki.**
## HHVM
This script also works with HHVM. Just replace the first line with
```
#!/bin/hhvm
```

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,106 @@
#!/bin/php
<?php
// This is a php script for converting a blogger atom feed into rangitaki blog posts
require './vendor/autoload.php';
use League\HTMLToMarkdown\HtmlConverter;
// Entry point. Without an input file, or with a help flag, print the usage
// text; otherwise treat the first argument as the Blogger export and write
// one Rangitaki article per blog post into articles/.
if (!isset($argv[1]) || in_array($argv[1], array("-h", "--help", "--usage", "-?"), true)) {
    help();
} else {
    $source = file_get_contents($argv[1]);
    if ($source === false) {
        fwrite(STDERR, "Could not read " . $argv[1] . "\n");
        exit(1);
    }
    $xml = new SimpleXMLElement($source);
    $converter = new HtmlConverter(array('strip_tags' => true));

    // Every date is rendered in UTC; this only has to be set once.
    date_default_timezone_set("UTC");

    // Make a output directory
    if (!file_exists("articles") && !mkdir("articles")) {
        fwrite(STDERR, "Could not create the articles directory\n");
        exit(1);
    }

    foreach ($xml->entry as $entry) {
        // A Blogger export mixes posts with settings, templates, pages and
        // comments. Each entry announces its type in a <category> element
        // with the "kind" scheme; every other non-Google category is a label.
        $kind = null;
        $tagList = array();
        foreach ($entry->category as $category) {
            $scheme = (string) $category['scheme'];
            $term = (string) $category['term'];
            if ($scheme === "http://schemas.google.com/g/2005#kind") {
                $kind = $term;
            } elseif ($term !== "" && strncmp($scheme, "http://schemas.google.com/", 26) !== 0) {
                $tagList[] = $term;
            }
        }
        // Entries without a kind marker (plain Atom feeds) are treated as posts.
        if ($kind !== null && substr($kind, -5) !== "#post") {
            continue;
        }

        // TITLE
        $title = (string) $entry->title;
        // CONTENT
        $markdown = $converter->convert((string) $entry->content);
        // AUTHOR
        $author = (string) $entry->author->name;
        // Pubdate
        $published = strtotime((string) $entry->published);
        $pubdate = date("d F Y", $published);
        // FILENAME
        // Directory separators inside a title would otherwise point the
        // output path at a directory that does not exist.
        $filetitle = str_replace(array(" ", "/", "\\"), "-", $title);
        $filename = date("Y-m-d-H-i", $published) . "-" . $filetitle . ".md";

        $lines = array(
            "%TITLE: " . $title,
            "%DATE: " . $pubdate,
            "%AUTHOR: " . $author,
        );
        // The %TAGS line is only written for posts that carry labels.
        if (count($tagList) > 0) {
            $lines[] = "%TAGS: " . implode(", ", $tagList);
        }
        $lines[] = $markdown;
        $filecontent = implode("\n", $lines);

        // Save the file. file_put_contents() truncates an existing file, so a
        // second run never leaves the tail of an older, longer article behind.
        if (file_put_contents("articles/" . $filename, $filecontent) === false) {
            fwrite(STDERR, "Could not write articles/" . $filename . "\n");
        }
    }
}
/**
 * Print the usage instructions of the script to standard output.
 *
 * The text ends with a newline so the shell prompt starts on a fresh line.
 */
function help() {
    $help = <<<EOD
blogger2rangitaki
A small PHP script which converts a Blogger XML export to Rangitaki blog posts.
COPYRIGHT © 2015 Rangitaki Project
MIT License
Usage:
./blogger2rangitaki.php filename.xml
Where filename.xml is the Blogger export.
The articles are saved in articles/

EOD;
    echo $help;
}

View file

@ -0,0 +1,5 @@
{
"require": {
"league/html-to-markdown": "~4.0"
}
}

78
blogger2rangitaki/composer.lock generated Normal file
View file

@ -0,0 +1,78 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "58637a0fe75a453726c4f248ef834809",
"packages": [
{
"name": "league/html-to-markdown",
"version": "4.0.0",
"source": {
"type": "git",
"url": "https://github.com/thephpleague/html-to-markdown.git",
"reference": "16f0fe21c60e8a76a51bdf3b256cb7a54bb1cac4"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thephpleague/html-to-markdown/zipball/16f0fe21c60e8a76a51bdf3b256cb7a54bb1cac4",
"reference": "16f0fe21c60e8a76a51bdf3b256cb7a54bb1cac4",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-xml": "*",
"php": ">=5.3.3"
},
"require-dev": {
"phpunit/phpunit": "4.*",
"scrutinizer/ocular": "~1.1"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "4.1-dev"
}
},
"autoload": {
"psr-4": {
"League\\HTMLToMarkdown\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Colin O'Dell",
"email": "colinodell@gmail.com",
"homepage": "http://www.colinodell.com",
"role": "Lead Developer"
},
{
"name": "Nick Cernis",
"email": "nick@cern.is",
"homepage": "http://modernnerd.net",
"role": "Original Author"
}
],
"description": "An HTML-to-markdown conversion helper for PHP",
"homepage": "https://github.com/thephpleague/html-to-markdown",
"keywords": [
"html",
"markdown"
],
"time": "2015-07-25 16:38:14"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": [],
"platform-dev": []
}

7
blogger2rangitaki/vendor/autoload.php vendored Normal file
View file

@ -0,0 +1,7 @@
<?php
// autoload.php @generated by Composer
// Load the generated initializer class from the composer/ directory next to this file.
require_once __DIR__ . '/composer' . '/autoload_real.php';
// Hand whatever getLoader() returns back to the script that included this file.
return ComposerAutoloaderInit7f8e28eb5836e5d023a7e972b858cf6e::getLoader();

View file

@ -0,0 +1,413 @@
<?php
/*
* This file is part of Composer.
*
* (c) Nils Adermann <naderman@naderman.de>
* Jordi Boggiano <j.boggiano@seld.be>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Composer\Autoload;
/**
* ClassLoader implements a PSR-0 class loader
*
* See https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-0.md
*
* $loader = new \Composer\Autoload\ClassLoader();
*
* // register classes with namespaces
* $loader->add('Symfony\Component', __DIR__.'/component');
* $loader->add('Symfony', __DIR__.'/framework');
*
* // activate the autoloader
* $loader->register();
*
* // to enable searching the include path (eg. for PEAR packages)
* $loader->setUseIncludePath(true);
*
* In this example, if you try to use a class in the Symfony\Component
* namespace or one of its children (Symfony\Component\Console for instance),
* the autoloader will first look for the class under the component/
* directory, and it will then fallback to the framework/ directory if not
* found before giving up.
*
* This class is loosely based on the Symfony UniversalClassLoader.
*
* @author Fabien Potencier <fabien@symfony.com>
* @author Jordi Boggiano <j.boggiano@seld.be>
*/
class ClassLoader
{
// PSR-4
// First character of a prefix => (prefix => prefix length); narrows the prefix scan in findFileWithExtension().
private $prefixLengthsPsr4 = array();
// Prefix => list of base directories.
private $prefixDirsPsr4 = array();
// Directories tried for every class after the prefixed PSR-4 directories.
private $fallbackDirsPsr4 = array();
// PSR-0
// First character of a prefix => (prefix => list of directories).
private $prefixesPsr0 = array();
// Directories tried for every class after the prefixed PSR-0 directories.
private $fallbackDirsPsr0 = array();
// Whether the PHP include path is consulted as the very last lookup step.
private $useIncludePath = false;
// Class name => file path; findFile() also stores false here for classes it could not locate.
private $classMap = array();
// When true, findFile() stops as soon as the class map has no entry for a class.
private $classMapAuthoritative = false;
/**
* Returns the registered PSR-0 prefixes as one flat prefix => directories array.
*
* @return array
*/
public function getPrefixes()
{
if (!empty($this->prefixesPsr0)) {
// The prefixes are stored bucketed by first character; merge the buckets together.
return call_user_func_array('array_merge', $this->prefixesPsr0);
}
return array();
}
/**
* Returns the registered PSR-4 prefixes with their base directories.
*
* @return array
*/
public function getPrefixesPsr4()
{
return $this->prefixDirsPsr4;
}
/**
* Returns the PSR-0 fallback directories.
*
* @return array
*/
public function getFallbackDirs()
{
return $this->fallbackDirsPsr0;
}
/**
* Returns the PSR-4 fallback directories.
*
* @return array
*/
public function getFallbackDirsPsr4()
{
return $this->fallbackDirsPsr4;
}
/**
* Returns the class map (class name => file path, or false for a cached miss).
*
* @return array
*/
public function getClassMap()
{
return $this->classMap;
}
/**
* Adds entries to the class map; entries passed in replace existing ones with the same class name.
*
* @param array $classMap Class to filename map
*/
public function addClassMap(array $classMap)
{
if ($this->classMap) {
$this->classMap = array_merge($this->classMap, $classMap);
} else {
$this->classMap = $classMap;
}
}
/**
* Registers a set of PSR-0 directories for a given prefix, either
* appending or prepending to the ones previously set for this prefix.
*
* @param string $prefix The prefix
* @param array|string $paths The PSR-0 root directories
* @param bool $prepend Whether to prepend the directories
*/
public function add($prefix, $paths, $prepend = false)
{
// An empty prefix registers fallback directories instead of prefixed ones.
if (!$prefix) {
if ($prepend) {
$this->fallbackDirsPsr0 = array_merge(
(array) $paths,
$this->fallbackDirsPsr0
);
} else {
$this->fallbackDirsPsr0 = array_merge(
$this->fallbackDirsPsr0,
(array) $paths
);
}
return;
}
// Prefixes are bucketed by their first character.
$first = $prefix[0];
if (!isset($this->prefixesPsr0[$first][$prefix])) {
$this->prefixesPsr0[$first][$prefix] = (array) $paths;
return;
}
if ($prepend) {
$this->prefixesPsr0[$first][$prefix] = array_merge(
(array) $paths,
$this->prefixesPsr0[$first][$prefix]
);
} else {
$this->prefixesPsr0[$first][$prefix] = array_merge(
$this->prefixesPsr0[$first][$prefix],
(array) $paths
);
}
}
/**
* Registers a set of PSR-4 directories for a given namespace, either
* appending or prepending to the ones previously set for this namespace.
*
* @param string $prefix The prefix/namespace, with trailing '\\'
* @param array|string $paths The PSR-4 base directories
* @param bool $prepend Whether to prepend the directories
*
* @throws \InvalidArgumentException
*/
public function addPsr4($prefix, $paths, $prepend = false)
{
if (!$prefix) {
// Register directories for the root namespace.
if ($prepend) {
$this->fallbackDirsPsr4 = array_merge(
(array) $paths,
$this->fallbackDirsPsr4
);
} else {
$this->fallbackDirsPsr4 = array_merge(
$this->fallbackDirsPsr4,
(array) $paths
);
}
} elseif (!isset($this->prefixDirsPsr4[$prefix])) {
// Register directories for a new namespace.
$length = strlen($prefix);
if ('\\' !== $prefix[$length - 1]) {
throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
}
// Remember the prefix length so the lookup can strip the prefix from the class path.
$this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
$this->prefixDirsPsr4[$prefix] = (array) $paths;
} elseif ($prepend) {
// Prepend directories for an already registered namespace.
$this->prefixDirsPsr4[$prefix] = array_merge(
(array) $paths,
$this->prefixDirsPsr4[$prefix]
);
} else {
// Append directories for an already registered namespace.
$this->prefixDirsPsr4[$prefix] = array_merge(
$this->prefixDirsPsr4[$prefix],
(array) $paths
);
}
}
/**
* Registers a set of PSR-0 directories for a given prefix,
* replacing any others previously set for this prefix.
*
* @param string $prefix The prefix
* @param array|string $paths The PSR-0 base directories
*/
public function set($prefix, $paths)
{
if (!$prefix) {
$this->fallbackDirsPsr0 = (array) $paths;
} else {
$this->prefixesPsr0[$prefix[0]][$prefix] = (array) $paths;
}
}
/**
* Registers a set of PSR-4 directories for a given namespace,
* replacing any others previously set for this namespace.
*
* @param string $prefix The prefix/namespace, with trailing '\\'
* @param array|string $paths The PSR-4 base directories
*
* @throws \InvalidArgumentException
*/
public function setPsr4($prefix, $paths)
{
if (!$prefix) {
$this->fallbackDirsPsr4 = (array) $paths;
} else {
$length = strlen($prefix);
if ('\\' !== $prefix[$length - 1]) {
throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
}
$this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
$this->prefixDirsPsr4[$prefix] = (array) $paths;
}
}
/**
* Turns on searching the include path for class files.
*
* @param bool $useIncludePath
*/
public function setUseIncludePath($useIncludePath)
{
$this->useIncludePath = $useIncludePath;
}
/**
* Can be used to check if the autoloader uses the include path to check
* for classes.
*
* @return bool
*/
public function getUseIncludePath()
{
return $this->useIncludePath;
}
/**
* Turns off searching the prefix and fallback directories for classes
* that have not been registered with the class map.
*
* @param bool $classMapAuthoritative
*/
public function setClassMapAuthoritative($classMapAuthoritative)
{
$this->classMapAuthoritative = $classMapAuthoritative;
}
/**
* Should class lookup fail if not found in the current class map?
*
* @return bool
*/
public function isClassMapAuthoritative()
{
return $this->classMapAuthoritative;
}
/**
* Registers this instance as an autoloader.
*
* @param bool $prepend Whether to prepend the autoloader or not
*/
public function register($prepend = false)
{
spl_autoload_register(array($this, 'loadClass'), true, $prepend);
}
/**
* Unregisters this instance as an autoloader.
*/
public function unregister()
{
spl_autoload_unregister(array($this, 'loadClass'));
}
/**
* Loads the given class or interface.
*
* @param string $class The name of the class
* @return bool|null True if loaded, null otherwise
*/
public function loadClass($class)
{
if ($file = $this->findFile($class)) {
// includeFile() is a plain function, so the included file cannot see $this.
includeFile($file);
return true;
}
}
/**
* Finds the path to the file where the class is defined.
*
* @param string $class The name of the class
*
* @return string|false The path if found, false otherwise
*/
public function findFile($class)
{
// work around for PHP 5.3.0 - 5.3.2 https://bugs.php.net/50731
if ('\\' == $class[0]) {
$class = substr($class, 1);
}
// class map lookup
// isset() is also true for a cached false, so a known miss returns false right here.
if (isset($this->classMap[$class])) {
return $this->classMap[$class];
}
if ($this->classMapAuthoritative) {
return false;
}
$file = $this->findFileWithExtension($class, '.php');
// Search for Hack files if we are running on HHVM
if ($file === null && defined('HHVM_VERSION')) {
$file = $this->findFileWithExtension($class, '.hh');
}
if ($file === null) {
// Remember that this class does not exist.
return $this->classMap[$class] = false;
}
return $file;
}
/**
* Searches the registered directories for the file that defines a class.
*
* Lookup order: PSR-4 prefixes, PSR-4 fallback directories, PSR-0 prefixes,
* PSR-0 fallback directories and finally the include path (when enabled).
*
* @param string $class The name of the class, without a leading backslash
* @param string $ext The file extension to append, including the dot
*
* @return string|null The first existing path, or null when nothing matched
*/
private function findFileWithExtension($class, $ext)
{
// PSR-4 lookup
$logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext;
$first = $class[0];
if (isset($this->prefixLengthsPsr4[$first])) {
foreach ($this->prefixLengthsPsr4[$first] as $prefix => $length) {
if (0 === strpos($class, $prefix)) {
foreach ($this->prefixDirsPsr4[$prefix] as $dir) {
// PSR-4 drops the prefix: only the part of the path after it is appended to the base directory.
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $length))) {
return $file;
}
}
}
}
}
// PSR-4 fallback dirs
foreach ($this->fallbackDirsPsr4 as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr4)) {
return $file;
}
}
// PSR-0 lookup
if (false !== $pos = strrpos($class, '\\')) {
// namespaced class name
// Underscores become directory separators in the class-name part only, not in the namespace part.
$logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1)
. strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR);
} else {
// PEAR-like class name
$logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext;
}
if (isset($this->prefixesPsr0[$first])) {
foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) {
if (0 === strpos($class, $prefix)) {
foreach ($dirs as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
return $file;
}
}
}
}
}
// PSR-0 fallback dirs
foreach ($this->fallbackDirsPsr0 as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
return $file;
}
}
// PSR-0 include paths.
if ($this->useIncludePath && $file = stream_resolve_include_path($logicalPathPsr0)) {
return $file;
}
// No match: execution falls off the end, which yields null for the caller's "=== null" checks.
}
}
/**
* Scope isolated include.
*
* Prevents access to $this/self from included files.
* The include runs inside this plain function, which is declared outside the
* ClassLoader class, so the included file only sees this function's scope.
*
* @param string $file Path of the file to include
*/
function includeFile($file)
{
include $file;
}

View file

@ -0,0 +1,9 @@
<?php
// autoload_classmap.php @generated by Composer
// Parent of the directory that contains this file.
$vendorDir = dirname(dirname(__FILE__));
// Parent of $vendorDir.
$baseDir = dirname($vendorDir);
// The returned map is empty: no entries were generated.
return array(
);

View file

@ -0,0 +1,9 @@
<?php
// autoload_namespaces.php @generated by Composer
// Parent of the directory that contains this file.
$vendorDir = dirname(dirname(__FILE__));
// Parent of $vendorDir.
$baseDir = dirname($vendorDir);
// The returned map is empty: no entries were generated.
return array(
);

View file

@ -0,0 +1,10 @@
<?php
// autoload_psr4.php @generated by Composer
// Parent of the directory that contains this file.
$vendorDir = dirname(dirname(__FILE__));
// Parent of $vendorDir.
$baseDir = dirname($vendorDir);
// Namespace prefix (with trailing backslash) => list of directories holding its classes.
return array(
'League\\HTMLToMarkdown\\' => array($vendorDir . '/league/html-to-markdown/src'),
);

View file

@ -0,0 +1,50 @@
<?php
// autoload_real.php @generated by Composer
/**
* Builds and registers the Composer class loader exactly once.
*/
class ComposerAutoloaderInit7f8e28eb5836e5d023a7e972b858cf6e
{
// The loader created by the first getLoader() call; null until then.
private static $loader;
/**
* Temporary autoload callback that can only load the ClassLoader class itself.
*
* @param string $class The name of the class being requested
*/
public static function loadClassLoader($class)
{
if ('Composer\Autoload\ClassLoader' === $class) {
require __DIR__ . '/ClassLoader.php';
}
}
/**
* Returns the class loader, creating, configuring and registering it on the first call.
*
* @return \Composer\Autoload\ClassLoader
*/
public static function getLoader()
{
if (null !== self::$loader) {
return self::$loader;
}
// Register loadClassLoader() only for the duration of the "new ClassLoader()" call below.
spl_autoload_register(array('ComposerAutoloaderInit7f8e28eb5836e5d023a7e972b858cf6e', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
spl_autoload_unregister(array('ComposerAutoloaderInit7f8e28eb5836e5d023a7e972b858cf6e', 'loadClassLoader'));
// Feed the generated PSR-0 map into the loader.
$map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) {
$loader->set($namespace, $path);
}
// Feed the generated PSR-4 map into the loader.
$map = require __DIR__ . '/autoload_psr4.php';
foreach ($map as $namespace => $path) {
$loader->setPsr4($namespace, $path);
}
// Add the generated class map, skipping the call when it is empty.
$classMap = require __DIR__ . '/autoload_classmap.php';
if ($classMap) {
$loader->addClassMap($classMap);
}
// true = prepend this loader to the autoload queue.
$loader->register(true);
return $loader;
}
}
/**
* Requires a file from inside a plain function, so the file runs in this
* function's scope rather than in the caller's.
*
* @param string $file Path of the file to require
*/
function composerRequire7f8e28eb5836e5d023a7e972b858cf6e($file)
{
require $file;
}

View file

@ -0,0 +1,64 @@
[
{
"name": "league/html-to-markdown",
"version": "4.0.0",
"version_normalized": "4.0.0.0",
"source": {
"type": "git",
"url": "https://github.com/thephpleague/html-to-markdown.git",
"reference": "16f0fe21c60e8a76a51bdf3b256cb7a54bb1cac4"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thephpleague/html-to-markdown/zipball/16f0fe21c60e8a76a51bdf3b256cb7a54bb1cac4",
"reference": "16f0fe21c60e8a76a51bdf3b256cb7a54bb1cac4",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-xml": "*",
"php": ">=5.3.3"
},
"require-dev": {
"phpunit/phpunit": "4.*",
"scrutinizer/ocular": "~1.1"
},
"time": "2015-07-25 16:38:14",
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "4.1-dev"
}
},
"installation-source": "dist",
"autoload": {
"psr-4": {
"League\\HTMLToMarkdown\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Colin O'Dell",
"email": "colinodell@gmail.com",
"homepage": "http://www.colinodell.com",
"role": "Lead Developer"
},
{
"name": "Nick Cernis",
"email": "nick@cern.is",
"homepage": "http://modernnerd.net",
"role": "Original Author"
}
],
"description": "An HTML-to-markdown conversion helper for PHP",
"homepage": "https://github.com/thephpleague/html-to-markdown",
"keywords": [
"html",
"markdown"
]
}
]

View file

@ -0,0 +1,12 @@
preset: recommended
enabled:
- concat_with_spaces
- strict
disabled:
- concat_without_spaces
- phpdoc_short_description
- psr0
- short_array_syntax

View file

@ -0,0 +1,120 @@
# Change Log
All notable changes to this project will be documented in this file.
Updates should follow the [Keep a CHANGELOG](http://keepachangelog.com/) principles.
## [Unreleased][unreleased]
## [4.0.0]
This release changes the visibility of several methods/properties. #42 and #43 brought to light that some visibilities were
not ideally set, so this release fixes that. Moving forwards this should reduce the chance of introducing BC-breaking changes.
### Added
- Added new `HtmlConverter::getEnvironment()` method to expose the `Environment` (#42, #43)
### Changed
- Changed `Environment::addConverter()` from `protected` to `public`, enabling custom converters to be added (#42, #43)
- Changed `HtmlConverter::createDOMDocument()` from `protected` to `private`
- Changed `Element::nextCached` from `protected` to `private`
- Made the `Environment` class `final`
## [3.1.1]
### Fixed
- Empty HTML strings now result in empty Markdown documents (#40, #41)
## [3.1.0]
### Added
- Added new `equals` method to `Element` to check for equality
### Changes
- Use Linux line endings consistently instead of platform-specific line endings (#36)
### Fixed
- Cleaned up code style
## [3.0.0]
### Changed
- Changed namespace to `League\HTMLToMarkdown`
- Changed packagist name to `league/html-to-markdown`
- Re-organized code into several separate classes
- `<a>` tags with identical href and inner text are now rendered using angular bracket syntax (#31)
- `<div>` elements are now treated as block-level elements (#33)
## [2.2.2]
### Added
- Added support for PHP 5.6 and HHVM
- Enabled testing against PHP 7 nightlies
- Added this CHANGELOG.md
### Fixed
- Fixed whitespace preservation between inline elements (#9 and #10)
## [2.2.1]
### Fixed
- Preserve placeholder links (#22)
## [2.2.0]
### Added
- Added CircleCI config
### Changed
- `<pre>` blocks are now treated as code elements
### Removed
- Dropped support for PHP 5.2
- Removed incorrect README comment regarding `#text` nodes (#17)
## [2.1.2]
### Added
- Added the ability to blacklist/remove specific node types (#11)
### Changed
- Line breaks are now placed after divs instead of before them
- Newlines inside of link texts are now removed
- Updated the minimum PHPUnit version to 4.*
## [2.1.1]
### Added
- Added options to customize emphasis characters
## [2.1.0]
### Added
- Added option to strip HTML tags without Markdown equivalents
- Added `convert()` method for converter reuse
- Added ability to set options after instance construction
- Documented the required PHP extensions (#4)
### Changed
- ATX style now used for h1 and h2 tags inside blockquotes
### Fixed
- Newlines inside blockquotes are now started with a bracket
- Fixed some incorrect docblocks
- `__toString()` now returns an empty string if input is empty
- Convert head tag if body tag is empty (#7)
- Preserve special characters inside tags without md equivalents (#6)
## [2.0.1]
### Fixed
- Fixed first line indentation for multi-line code blocks
- Fixed consecutive anchors get separating spaces stripped (#3)
## [2.0.0]
### Added
- Initial release
[unreleased]: https://github.com/thephpleague/html-to-markdown/compare/4.0.0...master
[4.0.0]: https://github.com/thephpleague/html-to-markdown/compare/3.1.1...4.0.0
[3.1.1]: https://github.com/thephpleague/html-to-markdown/compare/3.1.0...3.1.1
[3.1.0]: https://github.com/thephpleague/html-to-markdown/compare/3.0.0...3.1.0
[3.0.0]: https://github.com/thephpleague/html-to-markdown/compare/2.2.2...3.0.0
[2.2.2]: https://github.com/thephpleague/html-to-markdown/compare/2.2.1...2.2.2
[2.2.1]: https://github.com/thephpleague/html-to-markdown/compare/2.2.0...2.2.1
[2.2.0]: https://github.com/thephpleague/html-to-markdown/compare/2.1.2...2.2.0
[2.1.2]: https://github.com/thephpleague/html-to-markdown/compare/2.1.1...2.1.2
[2.1.1]: https://github.com/thephpleague/html-to-markdown/compare/2.1.0...2.1.1
[2.1.0]: https://github.com/thephpleague/html-to-markdown/compare/2.0.1...2.1.0
[2.0.1]: https://github.com/thephpleague/html-to-markdown/compare/2.0.0...2.0.1
[2.0.0]: https://github.com/thephpleague/html-to-markdown/compare/775f91e...2.0.0

View file

@ -0,0 +1,32 @@
# Contributing
Contributions are **welcome** and will be fully **credited**.
We accept contributions via Pull Requests on [Github](https://github.com/thephpleague/html-to-markdown).
## Pull Requests
- **[PSR-2 Coding Standard](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-2-coding-style-guide.md)** - The easiest way to apply the conventions is to install [PHP Code Sniffer](http://pear.php.net/package/PHP_CodeSniffer).
- **Add tests!** - Your patch won't be accepted if it doesn't have tests.
- **Document any change in behaviour** - Make sure the `README.md` and any other relevant documentation are kept up-to-date.
- **Consider our release cycle** - We try to follow [SemVer v2.0.0](http://semver.org/). Randomly breaking public APIs is not an option.
- **Create feature branches** - Don't ask us to pull from your master branch.
- **One pull request per feature** - If you want to do more than one thing, send multiple pull requests.
- **Send coherent history** - Make sure each individual commit in your pull request is meaningful. If you had to make multiple intermediate commits while developing, please [squash them](http://www.git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Changing-Multiple-Commit-Messages) before submitting.
## Running Tests
``` bash
$ ./vendor/bin/phpunit
```
**Happy coding**!

View file

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2015 Colin O'Dell
Originally created by Nick Cernis
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -0,0 +1,151 @@
HTML To Markdown for PHP
========================
[![Latest Version](https://img.shields.io/packagist/v/league/html-to-markdown.svg?style=flat-square)](https://packagist.org/packages/league/html-to-markdown)
[![Software License](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square)](LICENSE)
[![Build Status](https://img.shields.io/travis/thephpleague/html-to-markdown/master.svg?style=flat-square)](https://travis-ci.org/thephpleague/html-to-markdown)
[![Coverage Status](https://img.shields.io/scrutinizer/coverage/g/thephpleague/html-to-markdown.svg?style=flat-square)](https://scrutinizer-ci.com/g/thephpleague/html-to-markdown/code-structure)
[![Quality Score](https://img.shields.io/scrutinizer/g/thephpleague/html-to-markdown.svg?style=flat-square)](https://scrutinizer-ci.com/g/thephpleague/html-to-markdown)
[![Total Downloads](https://img.shields.io/packagist/dt/league/html-to-markdown.svg?style=flat-square)](https://packagist.org/packages/league/html-to-markdown)
Library which converts HTML to [Markdown](http://daringfireball.net/projects/markdown/) for your sanity and convenience.
**Requires**: PHP 5.3+
**Lead Developer**: [@colinodell](http://twitter.com/colinodell)
**Original Author**: [@nickcernis](http://twitter.com/nickcernis)
### Why convert HTML to Markdown?
*"What alchemy is this?"* you mutter. *"I can see why you'd convert [Markdown to HTML](https://github.com/thephpleague/commonmark),"* you continue, already labouring the question somewhat, *"but why go the other way?"*
Typically you would convert HTML to Markdown if:
1. You have an existing HTML document that needs to be edited by people with good taste.
2. You want to store new content in HTML format but edit it as Markdown.
3. You want to convert HTML email to plain text email.
4. You know a guy who's been converting HTML to Markdown for years, and now he can speak Elvish. You'd quite like to be able to speak Elvish.
5. You just really like Markdown.
### How to use it
Require the library in your composer.json:
{
"require": {
"league/html-to-markdown": "~4.0"
}
}
Then `composer install` and add `require 'vendor/autoload.php';` to the top of your script.
Next, create a new HtmlConverter instance, passing in your valid HTML code to its `convert()` function:
use League\HTMLToMarkdown\HtmlConverter;
$converter = new HtmlConverter();
$html = "<h3>Quick, to the Batpoles!</h3>";
$markdown = $converter->convert($html);
The `$markdown` variable now contains the Markdown version of your HTML as a string:
echo $markdown; // ==> ### Quick, to the Batpoles!
The included `demo` directory contains an HTML->Markdown conversion form to try out.
### Conversion options
By default, HTML To Markdown preserves HTML tags without Markdown equivalents, like `<span>` and `<div>`.
To strip HTML tags that don't have a Markdown equivalent while preserving the content inside them, set `strip_tags` to true, like this:
$converter = new HtmlConverter(array('strip_tags' => true));
$html = '<span>Turnips!</span>';
$markdown = $converter->convert($html); // $markdown now contains "Turnips!"
Or more explicitly, like this:
$converter = new HtmlConverter();
$converter->setOption('strip_tags', true);
$html = '<span>Turnips!</span>';
$markdown = $converter->convert($html); // $markdown now contains "Turnips!"
Note that only the tags themselves are stripped, not the content they hold.
To strip tags and their content, pass a space-separated list of tags in `remove_nodes`, like this:
$converter = new HtmlConverter(array('remove_nodes' => 'span div'));
$html = '<span>Turnips!</span><div>Monkeys!</div>';
$markdown = $converter->convert($html); // $markdown now contains ""
### Style options
Bold and italic tags are converted using the asterisk syntax by default. Change this to the underlined syntax using the `bold_style` and `italic_style` options.
$converter = new HtmlConverter();
$converter->setOption('italic_style', '_');
$converter->setOption('bold_style', '__');
$html = '<em>Italic</em> and a <strong>bold</strong>';
$markdown = $converter->convert($html); // $markdown now contains "_Italic_ and a __bold__"
### Limitations
- Markdown Extra, MultiMarkdown and other variants aren't supported – just Markdown.
### Known issues
- Nested lists and lists containing multiple paragraphs aren't converted correctly.
- Lists inside blockquotes aren't converted correctly.
- Any reported [open issues here](https://github.com/thephpleague/html-to-markdown/issues?state=open).
[Report your issue or request a feature here.](https://github.com/thephpleague/html-to-markdown/issues/new) Issues with patches or failing tests are especially welcome.
### Style notes
- Setext (underlined) headers are the default for H1 and H2. If you prefer the ATX style for H1 and H2 (# Header 1 and ## Header 2), set `header_style` to 'atx' in the options array when you instantiate the object:
`$converter = new HtmlConverter(array('header_style'=>'atx'));`
Headers of H3 priority and lower always use atx style.
- Links and images are referenced inline. Footnote references (where image src and anchor href attributes are listed in the footnotes) are not used.
- Blockquotes aren't line wrapped – it makes the converted Markdown easier to edit.
### Dependencies
HTML To Markdown requires PHP's [xml](http://www.php.net/manual/en/xml.installation.php), [lib-xml](http://www.php.net/manual/en/libxml.installation.php), and [dom](http://www.php.net/manual/en/dom.installation.php) extensions, all of which are enabled by default on most distributions.
Errors such as "Fatal error: Class 'DOMDocument' not found" on distributions such as CentOS that disable PHP's xml extension can be resolved by installing php-xml.
### Contributors
Many thanks to all [contributors](https://github.com/thephpleague/html-to-markdown/graphs/contributors) so far. Further improvements and feature suggestions are very welcome.
### How it works
HTML To Markdown creates a DOMDocument from the supplied HTML, walks through the tree, and converts each node to a text node containing the equivalent markdown, starting from the most deeply nested node and working inwards towards the root node.
### To-do
- Support for nested lists and lists inside blockquotes.
- Offer an option to preserve tags as HTML if they contain attributes that can't be represented with Markdown (e.g. `style`).
### Trying to convert Markdown to HTML?
Use one of these great libraries:
- [league/commonmark](https://github.com/thephpleague/commonmark) (recommended)
- [cebe/markdown](https://github.com/cebe/markdown)
- [PHP Markdown](https://michelf.ca/projects/php-markdown/)
- [Parsedown](https://github.com/erusev/parsedown)
No guarantees about the Elvish, though.

View file

@ -0,0 +1,46 @@
{
"name": "league/html-to-markdown",
"type": "library",
"description": "An HTML-to-markdown conversion helper for PHP",
"keywords": ["markdown", "html"],
"homepage": "https://github.com/thephpleague/html-to-markdown",
"license": "MIT",
"authors": [
{
"name": "Colin O'Dell",
"email": "colinodell@gmail.com",
"homepage": "http://www.colinodell.com",
"role": "Lead Developer"
},
{
"name": "Nick Cernis",
"email": "nick@cern.is",
"homepage": "http://modernnerd.net",
"role": "Original Author"
}
],
"autoload": {
"psr-4": {
"League\\HTMLToMarkdown\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"League\\HTMLToMarkdown\\Test\\": "tests"
}
},
"require": {
"php": ">=5.3.3",
"ext-dom": "*",
"ext-xml": "*"
},
"require-dev": {
"phpunit/phpunit": "4.*",
"scrutinizer/ocular": "~1.1"
},
"extra": {
"branch-alias": {
"dev-master": "4.1-dev"
}
}
}

View file

@ -0,0 +1,180 @@
<?php
// Demo page prologue: converts the HTML submitted through the form below
// into Markdown, which the template then prints into the right-hand textarea.
require_once dirname(__FILE__) . '/../vendor/autoload.php';

$markdown = '';

// Only accept a plain string field; a POST without 'html' (or with 'html[]')
// is treated like a first visit so the sample document is shown instead.
$html = (isset($_POST['html']) && is_string($_POST['html'])) ? $_POST['html'] : null;

if (!is_null($html)) {
    // Magic quotes can only be active before PHP 5.4, and the function itself
    // was removed in PHP 8, so it is only called on versions where it matters.
    if (PHP_VERSION_ID < 50400 && get_magic_quotes_gpc()) {
        $html = stripslashes($html);
    }

    // The constructor takes an options array; the HTML itself goes to convert().
    $converter = new \League\HTMLToMarkdown\HtmlConverter();
    $markdown = $converter->convert($html);
}
?>
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>HTML To Markdown Demo</title>
<style>
body {
font-family: helvetica, arial, sans-serif;
}
</style>
</head>
<body>
<div style="width:50%;float:left;">
<h3>HTML</h3>
<form method="post" action="">
<?php if (!is_null($html)): ?>
<textarea rows="30" style="width:95%" name="html" id="html"><?php /* Escape so submitted markup cannot close the textarea and entities survive the round trip. */ echo htmlspecialchars($html, ENT_QUOTES, 'UTF-8') ?></textarea><br/>
<?php else: ?>
<textarea rows="30" style="width:95%" name="html" id="html">
<h1>A level one header</h1>
<p>Some paragraph text&#8482; containing &ldquo;UTF-8&rdquo; chars&hellip;</p>
<h2>A longer level two header</h2>
<h3>Here's a <em>level 3</em> title</h3>
<p>Some text containing<br/>a forced break.</p>
<p>Some text containing an
unforced break.</p>
<h2>Blockquotes and horizontal rules</h2>
<blockquote>Here's a blockquote</blockquote>
<hr/>
<blockquote>
<p>This should have a single arrow.</p>
<blockquote>
<p>A blockquote inside a blockquote, with a double arrow, on a new line.</p>
</blockquote>
</blockquote>
<hr/>
<blockquote>
<p>A multi-paragraph blockquote.</p>
<p>Here's the second paragraph. (Should be inside blockquote.)</p>
<h4>A header inside a blockquote</h4>
<p><img src="/path/img.jpg" alt="Image in a blockquote" title="Image in a blockquote"/></p>
<ul>
<li>List in a blockquote</li>
<li>Second list item</li>
</ul>
</blockquote>
<h2>Lists</h2>
<ul>
<li>An unordered list</li>
<li>Appears with hyphens</li>
</ul>
<ol>
<li>An ordered list</li>
<li>Appears with numbers.</li>
<li>Automatically indexed.</li>
</ol>
<h2>Links and images</h2>
<p><img src="/path/img.jpg" alt="alt text" title="Title"/></p>
<p>An example of a <a href="http://url.com/" title="Title">link.</a></p>
<p>An image inside a link:<br/>
<a href="http://url.com/" title="Title"><img src="/path/img.jpg" alt="alt text" title="Title"/></a>
</p>
<h2>Inline elements</h2>
<p><em>This text is in italics.</em></p>
<p><strong>This text is in bold.</strong></p>
<p>An <em>em</em> and a <strong>strong</strong> inside a paragraph.</p>
<p>A <em><span>span</span> inside</em> an em.</p>
<p>A <em><strong>strong</strong> inside</em> an em.</p>
<p>A <span><strong>strong</strong> inside</span> a span.</p>
<h2>Code blocks and spans</h2>
<p><code>
#sidebar h1 {
font-size: 1.5em;
font-weight: bold;
}
</code></p>
<p><code>A <strong>code</strong> span</code></p>
<h2>Bugs (tests from here on fail)</h2>
<h4>A list with multiple paragraphs</h4>
<ul>
<li><p>A list item.</p>
<p>With multiple paragraphs.</p></li>
<li>List item two.</li>
</ul>
<h4>Mixed ordered and unordered nested lists</h4>
<ul>
<li>List 1
<ul>
<li>List 2</li>
</ul>
</li>
<li>List 1b
<ol>
<li>List 3a</li>
<li>List 3b
<ul>
<li>List 4</li>
</ul>
</li>
<li>List 3c</li>
</ol>
</li>
<li>List 1c</li>
</ul>
</textarea>
<?php endif; ?>
<input type="submit" value="Convert HTML to Markdown >>" name="submit">
</form>
</div>
<div style="width:50%;float:right;">
<h3>Markdown</h3>
<textarea rows="30" style="width:95%; font-family:monospace;" name="markdown" id="markdown"><?php
// Explicit flags/charset: before PHP 5.4 the default charset was ISO-8859-1, which mangles UTF-8 output.
echo htmlspecialchars($markdown, ENT_QUOTES, 'UTF-8'); ?></textarea><br/>
</div>
<div style="clear:both;"></div>
<p>
<small><a href="https://github.com/thephpleague/html-to-markdown">HTML To Markdown</a> is a library to convert HTML into Markdown with PHP.</small>
</p>
</body>
</html>

View file

@ -0,0 +1,60 @@
<?php
namespace League\HTMLToMarkdown;
class Configuration
{
    /**
     * Option values keyed by option name.
     *
     * @var array
     */
    protected $config;

    /**
     * @param array $config Initial option values
     */
    public function __construct(array $config = array())
    {
        $this->config = $config;
    }

    /**
     * Recursively overlays the given options on top of the current ones.
     *
     * @param array $config Options that take precedence over the stored values
     */
    public function merge(array $config = array())
    {
        $merged = array_replace_recursive($this->config, $config);
        $this->config = $merged;
    }

    /**
     * Discards all current options and stores the given ones instead.
     *
     * @param array $config The new, complete set of options
     */
    public function replace(array $config = array())
    {
        $this->config = $config;
    }

    /**
     * Sets a single option, overwriting any existing value.
     *
     * @param string $key
     * @param mixed  $value
     */
    public function setOption($key, $value)
    {
        $this->config[$key] = $value;
    }

    /**
     * Reads one option, or the whole option array when no key is given.
     *
     * An option that is missing or whose stored value is null yields $default.
     *
     * @param string|null $key
     * @param mixed|null  $default
     *
     * @return mixed|null
     */
    public function getOption($key = null, $default = null)
    {
        if ($key === null) {
            return $this->config;
        }

        return isset($this->config[$key]) ? $this->config[$key] : $default;
    }
}

View file

@ -0,0 +1,11 @@
<?php
namespace League\HTMLToMarkdown;
interface ConfigurationAwareInterface
{
    /**
     * Hands the implementing object the Configuration it should read its options from.
     *
     * @param Configuration $config
     */
    public function setConfig(Configuration $config);
}

View file

@ -0,0 +1,44 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class BlockquoteConverter implements ConverterInterface
{
    /**
     * Turns the (already converted) content of a blockquote into quoted Markdown.
     *
     * @param ElementInterface $element
     *
     * @return string Every line prefixed with '> ', followed by one blank line
     */
    public function convert(ElementInterface $element)
    {
        // The children are Markdown by now; only the quote markers are missing.
        $body = trim($element->getValue());

        $quoted = '';
        foreach (preg_split('/\r\n|\r|\n/', $body) as $row) {
            $quoted .= '> ' . $row . "\n";
        }

        // A blank line terminates the quote.
        return $quoted . "\n";
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('blockquote');
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class CommentConverter implements ConverterInterface
{
    /**
     * Drops comment nodes: they produce no Markdown at all.
     *
     * @param ElementInterface $element
     *
     * @return string Always the empty string
     */
    public function convert(ElementInterface $element)
    {
        $nothing = '';

        return $nothing;
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        $tags = array('#comment');

        return $tags;
    }
}

View file

@ -0,0 +1,20 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
interface ConverterInterface
{
    /**
     * Produces the Markdown text for the given element.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element);

    /**
     * Lists the node names (e.g. 'p', '#text') this converter is responsible for.
     *
     * @return string[]
     */
    public function getSupportedTags();
}

View file

@ -0,0 +1,50 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class DefaultConverter implements ConverterInterface, ConfigurationAwareInterface
{
    const DEFAULT_CONVERTER = '_default';

    /**
     * @var Configuration
     */
    protected $config;

    /**
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Fallback conversion for tags that have no dedicated converter.
     *
     * With the 'strip_tags' option off (its default here) the element is kept
     * as markup, taken from getChildrenAsString() and entity-decoded; with the
     * option on, only the element's value is returned.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $stripTags = $this->config->getOption('strip_tags', false);

        if (!$stripTags) {
            return html_entity_decode($element->getChildrenAsString());
        }

        return $element->getValue();
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array(self::DEFAULT_CONVERTER);
    }
}

View file

@ -0,0 +1,45 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class DivConverter implements ConverterInterface, ConfigurationAwareInterface
{
    /**
     * @var Configuration
     */
    protected $config;

    /**
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Converts a <div>.
     *
     * With 'strip_tags' off the element is kept as entity-decoded markup; with
     * it on, the element's value is returned followed by a blank line.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $stripTags = $this->config->getOption('strip_tags', false);

        if (!$stripTags) {
            return html_entity_decode($element->getChildrenAsString());
        }

        return $element->getValue() . "\n\n";
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('div');
    }
}

View file

@ -0,0 +1,50 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class EmphasisConverter implements ConverterInterface, ConfigurationAwareInterface
{
    /**
     * @var Configuration
     */
    protected $config;

    /**
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Wraps the element's text in the configured italic or bold marker.
     *
     * 'em' and 'i' use the 'italic_style' option; every other supported tag
     * uses 'bold_style'.
     *
     * Markers are placed directly against the text: whitespace found at the
     * start or end of the value is moved outside the markers, and an element
     * without any visible text is returned unchanged (no markers), because
     * output such as '** text **' or '****' is not emphasis in Markdown.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $tag = $element->getTagName();
        $value = $element->getValue();

        $text = trim($value);
        if ($text === '') {
            // Nothing to emphasise; keep any whitespace so adjacent words stay separated.
            return $value;
        }

        if ($tag === 'i' || $tag === 'em') {
            $style = $this->config->getOption('italic_style');
        } else {
            $style = $this->config->getOption('bold_style');
        }

        // Split off the surrounding whitespace so it can be re-attached outside the markers.
        $leading = (string) substr($value, 0, strlen($value) - strlen(ltrim($value)));
        $trailing = (string) substr($value, strlen(rtrim($value)));

        return $leading . $style . $text . $style . $trailing;
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('em', 'i', 'strong', 'b');
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class HardBreakConverter implements ConverterInterface
{
    /**
     * Converts a <br> into a Markdown hard line break.
     *
     * @param ElementInterface $element
     *
     * @return string Two spaces followed by a newline
     */
    public function convert(ElementInterface $element)
    {
        // Markdown only treats a line ending as a hard break when it is
        // preceded by at least two spaces; a single space is a soft break.
        return "  \n";
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('br');
    }
}

View file

@ -0,0 +1,78 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class HeaderConverter implements ConverterInterface, ConfigurationAwareInterface
{
    const STYLE_ATX = 'atx';
    const STYLE_SETEXT = 'setext';

    /**
     * @var Configuration
     */
    protected $config;

    /**
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Converts h1-h6 into a Markdown header.
     *
     * The underlined (setext) form is used only for levels 1 and 2, outside of
     * blockquotes, and when 'header_style' is 'setext'; everything else gets
     * the '#'-prefixed (atx) form.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        // The digit after the 'h' is the header level.
        $level = (int) substr($element->getTagName(), 1, 1);
        $style = $this->config->getOption('header_style', self::STYLE_SETEXT);

        $isTopLevel = ($level === 1 || $level === 2);
        $useSetext = $isTopLevel
            && !$element->isDescendantOf('blockquote')
            && $style === self::STYLE_SETEXT;

        if (!$useSetext) {
            return $this->createAtxHeader($level, $element->getValue());
        }

        return $this->createSetextHeader($level, $element->getValue());
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
    }

    /**
     * Builds an underlined header: '=' for level 1, '-' otherwise, with the
     * underline as long as the content (character count when mbstring exists).
     *
     * @param int    $level
     * @param string $content
     *
     * @return string
     */
    private function createSetextHeader($level, $content)
    {
        if (function_exists('mb_strlen')) {
            $width = mb_strlen($content, 'utf-8');
        } else {
            $width = strlen($content);
        }

        $rule = str_repeat(($level === 1) ? '=' : '-', $width);

        return $content . "\n" . $rule . "\n\n";
    }

    /**
     * Builds a header prefixed with one '#' per level.
     *
     * @param int    $level
     * @param string $content
     *
     * @return string
     */
    private function createAtxHeader($level, $content)
    {
        return str_repeat('#', $level) . ' ' . $content . "\n\n";
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class HorizontalRuleConverter implements ConverterInterface
{
    /**
     * Converts an <hr> into a Markdown horizontal rule followed by a blank line.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $rule = "- - - - - -\n\n";

        return $rule;
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        $tags = array('hr');

        return $tags;
    }
}

View file

@ -0,0 +1,37 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ImageConverter implements ConverterInterface
{
    /**
     * Converts an <img> into inline Markdown image syntax.
     *
     * The quoted title is included only when the 'title' attribute is non-empty.
     * No newlines are added, since an <img> is expected inside a block element.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $src = $element->getAttribute('src');
        $alt = $element->getAttribute('alt');
        $title = $element->getAttribute('title');

        $titlePart = ($title !== '') ? ' "' . $title . '"' : '';

        return '![' . $alt . '](' . $src . $titlePart . ')';
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('img');
    }
}

View file

@ -0,0 +1,42 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class LinkConverter implements ConverterInterface
{
    /**
     * Converts an <a> element into a Markdown link.
     *
     * - no (or falsy) href: the element is kept as entity-decoded markup
     * - non-empty title:    [text](href "title")
     * - text equals href:   <href>
     * - otherwise:          [text](href)
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $href = $element->getAttribute('href');
        $title = $element->getAttribute('title');
        $text = $element->getValue();

        // Without a target there is nothing to link to.
        if (!$href) {
            return html_entity_decode($element->getChildrenAsString());
        }

        if ($title !== '') {
            return '[' . $text . '](' . $href . ' "' . $title . '")';
        }

        if ($href === $text) {
            return '<' . $href . '>';
        }

        return '[' . $text . '](' . $href . ')';
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('a');
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ListBlockConverter implements ConverterInterface
{
    /**
     * Converts an <ol>/<ul>: the element's value followed by one newline.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $items = $element->getValue();

        return $items . "\n";
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('ol', 'ul');
    }
}

View file

@ -0,0 +1,37 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ListItemConverter implements ConverterInterface
{
    /**
     * Converts an <li> into one Markdown list line.
     *
     * Items whose parent is a 'ul' get a dash; all others are numbered with
     * the item's sibling position.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $isUnordered = $element->getParent()->getTagName() === 'ul';
        $text = trim($element->getValue());

        $marker = $isUnordered ? '-' : $element->getSiblingPosition() . '.';

        return $marker . ' ' . $text . "\n";
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('li');
    }
}

View file

@ -0,0 +1,28 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ParagraphConverter implements ConverterInterface
{
    /**
     * Converts a <p>: its value with trailing whitespace removed, followed by
     * a blank line. A paragraph containing only whitespace yields nothing.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $value = $element->getValue();

        // Compare against '' explicitly: a truthiness test would also discard
        // a paragraph whose whole text is "0", since PHP treats "0" as false.
        if (trim($value) === '') {
            return '';
        }

        return rtrim($value) . "\n\n";
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('p');
    }
}

View file

@ -0,0 +1,73 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class PreformattedConverter implements ConverterInterface
{
    /**
     * Converts <pre>/<code> into an indented code block or a backtick code span.
     *
     * A <pre>, or any content spanning more than one line, becomes a block in
     * which every line is indented by four spaces; a single-line <code>
     * becomes a span wrapped in backticks. A <pre> block is additionally
     * surrounded by newlines.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $markdown = '';

        // Work on the element's markup with the wrapping tags removed.
        $code_content = html_entity_decode($element->getChildrenAsString());
        $code_content = str_replace(array('<code>', '</code>'), '', $code_content);
        $code_content = str_replace(array('<pre>', '</pre>'), '', $code_content);

        // One array entry per line of code
        $lines = preg_split('/\r\n|\r|\n/', $code_content);
        $total = count($lines);

        if ($total > 1 || $element->getTagName() === 'pre') {
            // Drop the first and last line if they are blank. '&#xD;' (an
            // XML-style carriage return) is removed as a whole string: passing
            // it to trim() as a character list would also eat lines made of
            // '&', '#', 'x', 'D' or ';'. The comparison is against '' because
            // empty() would treat a line that is just "0" as blank.
            $first_line = trim(str_replace('&#xD;', '', $lines[0]));
            $last_line = trim(str_replace('&#xD;', '', $lines[$total - 1]));

            if ($first_line === '') {
                array_shift($lines);
            }

            if ($last_line === '') {
                array_pop($lines);
            }

            $count = 1;
            foreach ($lines as $line) {
                $line = str_replace('&#xD;', '', $line);
                // Four spaces mark an indented code block in Markdown.
                $markdown .= '    ' . $line;
                // Add newlines, except after line number $total. Note that
                // $total was counted before blank lines were dropped above, so
                // when a line was dropped every remaining line gets a newline.
                if ($count !== $total) {
                    $markdown .= "\n";
                }
                $count++;
            }
            $markdown .= "\n";
        } else {
            // A single line of code is a span, not a block: wrap it in backticks.
            $markdown .= '`' . $lines[0] . '`';
        }

        if ($element->getTagName() === 'pre') {
            $markdown = "\n" . $markdown . "\n";
        }

        return $markdown;
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('pre', 'code');
    }
}

View file

@ -0,0 +1,38 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class TextConverter implements ConverterInterface
{
    /**
     * Converts a text node.
     *
     * Runs of whitespace collapse to one space and a leading '#' is
     * backslash-escaped. Text that ends up as a single space is dropped when
     * there is no next element or the next element is a block.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $collapsed = preg_replace('~\s+~', ' ', $element->getValue());
        $escaped = preg_replace('~^#~', '\\\\#', $collapsed);

        if ($escaped !== ' ') {
            return $escaped;
        }

        $next = $element->getNext();

        return (!$next || $next->isBlock()) ? '' : $escaped;
    }

    /**
     * @return string[]
     */
    public function getSupportedTags()
    {
        return array('#text');
    }
}

View file

@ -0,0 +1,234 @@
<?php
namespace League\HTMLToMarkdown;
class Element implements ElementInterface
{
    /**
     * The wrapped DOM node.
     *
     * @var \DOMNode
     */
    protected $node;

    /**
     * Result of the first successful getNext() lookup.
     *
     * @var ElementInterface|null
     */
    private $nextCached;

    /**
     * @param \DOMNode $node The DOM node to wrap
     */
    public function __construct(\DOMNode $node)
    {
        $this->node = $node;
    }

    /**
     * Whether the node's name is one of the tags treated as block-level here.
     *
     * @return bool
     */
    public function isBlock()
    {
        switch ($this->getTagName()) {
            case 'blockquote':
            case 'body':
            case 'code':
            case 'div':
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
            case 'hr':
            case 'html':
            case 'li':
            case 'p':
            case 'ol':
            case 'ul':
                return true;
            default:
                return false;
        }
    }

    /**
     * @return bool True for '#text' nodes
     */
    public function isText()
    {
        return $this->getTagName() === '#text';
    }

    /**
     * @return bool True for '#text' nodes whose value is empty after trimming
     */
    public function isWhitespace()
    {
        return $this->getTagName() === '#text' && trim($this->getValue()) === '';
    }

    /**
     * @return string The DOM node name (e.g. 'p', '#text')
     */
    public function getTagName()
    {
        return $this->node->nodeName;
    }

    /**
     * @return string The DOM node value
     */
    public function getValue()
    {
        return $this->node->nodeValue;
    }

    /**
     * Wraps the parent node, if there is one.
     *
     * @return ElementInterface|null Null when the node has no parent
     */
    public function getParent()
    {
        $parent = $this->node->parentNode;

        // Check before constructing: the constructor requires a \DOMNode, and a
        // freshly created object is always truthy, so testing it afterwards
        // could never produce the documented null.
        if ($parent === null) {
            return null;
        }

        return new static($parent);
    }

    /**
     * @return bool
     */
    public function hasChildren()
    {
        return $this->node->hasChildNodes();
    }

    /**
     * Wraps every child node in a new element, in document order.
     *
     * @return ElementInterface[]
     */
    public function getChildren()
    {
        $ret = array();
        /** @var \DOMNode $node */
        foreach ($this->node->childNodes as $node) {
            $ret[] = new static($node);
        }

        return $ret;
    }

    /**
     * Returns the element that follows this one (see getNextNode()).
     *
     * A found element is cached; a miss is looked up again on the next call.
     *
     * @return ElementInterface|null
     */
    public function getNext()
    {
        if ($this->nextCached === null) {
            $nextNode = $this->getNextNode($this->node);
            if ($nextNode !== null) {
                $this->nextCached = new static($nextNode);
            }
        }

        return $this->nextCached;
    }

    /**
     * Finds the node after $node: its first child (when $checkChildren is
     * set), else its next sibling, else the node after its parent (without
     * descending into the parent's children again).
     *
     * @param \DomNode $node
     * @param bool     $checkChildren
     *
     * @return \DomNode|null Null when no following node exists
     */
    private function getNextNode($node, $checkChildren = true)
    {
        if ($checkChildren && $node->firstChild) {
            return $node->firstChild;
        } elseif ($node->nextSibling) {
            return $node->nextSibling;
        } elseif ($node->parentNode) {
            return $this->getNextNode($node->parentNode, false);
        }

        return null;
    }

    /**
     * Whether any ancestor of this node has one of the given names.
     *
     * @param string[]|string $tagNames
     *
     * @return bool
     */
    public function isDescendantOf($tagNames)
    {
        if (!is_array($tagNames)) {
            $tagNames = array($tagNames);
        }

        // Walk up the ancestor chain until it ends.
        for ($p = $this->node->parentNode; $p !== null && $p !== false; $p = $p->parentNode) {
            if (in_array($p->nodeName, $tagNames, true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Replaces this node in its parent with a text node holding $markdown.
     *
     * @param string $markdown
     */
    public function setFinalMarkdown($markdown)
    {
        $markdown_node = $this->node->ownerDocument->createTextNode($markdown);
        $this->node->parentNode->replaceChild($markdown_node, $this->node);
    }

    /**
     * @return string The node serialised with DOMNode::C14N()
     */
    public function getChildrenAsString()
    {
        return $this->node->C14N();
    }

    /**
     * Position of this element among its parent's children, starting at 1 and
     * not counting whitespace-only text nodes.
     *
     * @return int
     */
    public function getSiblingPosition()
    {
        $position = 0;

        // Count non-whitespace siblings up to and including this element.
        foreach ($this->getParent()->getChildren() as $current_node) {
            if (!$current_node->isWhitespace()) {
                $position++;
            }

            // getChildren() creates new wrappers, so elements are matched via
            // equals() rather than object identity.
            if ($this->equals($current_node)) {
                break;
            }
        }

        return $position;
    }

    /**
     * @param string $name
     *
     * @return string The attribute value, or '' when the node is not a \DOMElement
     */
    public function getAttribute($name)
    {
        if ($this->node instanceof \DOMElement) {
            return $this->node->getAttribute($name);
        }

        return '';
    }

    /**
     * Two Elements are equal when they wrap the same DOM node; any other
     * ElementInterface is equal only if it is this very object.
     *
     * @param ElementInterface $element
     *
     * @return bool
     */
    public function equals(ElementInterface $element)
    {
        if ($element instanceof self) {
            return $element->node === $this->node;
        }

        return $element === $this;
    }
}

View file

@ -0,0 +1,80 @@
<?php
namespace League\HTMLToMarkdown;
interface ElementInterface
{
    /**
     * Whether the element counts as block-level.
     *
     * @return bool
     */
    public function isBlock();

    /**
     * Whether the element is a text node.
     *
     * @return bool
     */
    public function isText();

    /**
     * Whether the element is a text node without visible content.
     *
     * @return bool
     */
    public function isWhitespace();

    /**
     * Name of the element, e.g. 'p' or '#text'.
     *
     * @return string
     */
    public function getTagName();

    /**
     * Value (content) of the element.
     *
     * @return string
     */
    public function getValue();

    /**
     * Parent of the element, if any.
     *
     * @return ElementInterface|null
     */
    public function getParent();

    /**
     * Whether one of the element's ancestors has one of the given names.
     *
     * @param string|string[] $tagNames
     *
     * @return bool
     */
    public function isDescendantOf($tagNames);

    /**
     * @return bool
     */
    public function hasChildren();

    /**
     * @return ElementInterface[]
     */
    public function getChildren();

    /**
     * Element following this one, if any.
     *
     * @return ElementInterface|null
     */
    public function getNext();

    /**
     * Position of the element among its siblings.
     *
     * @return int
     */
    public function getSiblingPosition();

    /**
     * String serialisation of the element.
     *
     * @return string
     */
    public function getChildrenAsString();

    /**
     * Stores the Markdown that was produced for this element.
     *
     * @param string $markdown
     */
    public function setFinalMarkdown($markdown);

    /**
     * Value of the named attribute.
     *
     * @param string $name
     *
     * @return string
     */
    public function getAttribute($name);
}

View file

@ -0,0 +1,102 @@
<?php
namespace League\HTMLToMarkdown;
use League\HTMLToMarkdown\Converter\BlockquoteConverter;
use League\HTMLToMarkdown\Converter\CommentConverter;
use League\HTMLToMarkdown\Converter\ConverterInterface;
use League\HTMLToMarkdown\Converter\DefaultConverter;
use League\HTMLToMarkdown\Converter\DivConverter;
use League\HTMLToMarkdown\Converter\EmphasisConverter;
use League\HTMLToMarkdown\Converter\HardBreakConverter;
use League\HTMLToMarkdown\Converter\HeaderConverter;
use League\HTMLToMarkdown\Converter\HorizontalRuleConverter;
use League\HTMLToMarkdown\Converter\ImageConverter;
use League\HTMLToMarkdown\Converter\LinkConverter;
use League\HTMLToMarkdown\Converter\ListBlockConverter;
use League\HTMLToMarkdown\Converter\ListItemConverter;
use League\HTMLToMarkdown\Converter\ParagraphConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
use League\HTMLToMarkdown\Converter\TextConverter;
final class Environment
{
    /**
     * Options shared with every configuration-aware converter.
     *
     * @var Configuration
     */
    protected $config;

    /**
     * Registered converters, keyed by the tag name they handle.
     *
     * @var ConverterInterface[]
     */
    protected $converters = array();

    /**
     * Creates an environment that only knows the default (fallback) converter.
     *
     * @param array $config Initial option values
     */
    public function __construct(array $config = array())
    {
        $this->config = new Configuration($config);
        $this->addConverter(new DefaultConverter());
    }

    /**
     * @return Configuration
     */
    public function getConfig()
    {
        return $this->config;
    }

    /**
     * Registers a converter for each tag it supports, replacing any converter
     * previously registered for the same tag. Configuration-aware converters
     * receive this environment's configuration first.
     *
     * @param ConverterInterface $converter
     */
    public function addConverter(ConverterInterface $converter)
    {
        if ($converter instanceof ConfigurationAwareInterface) {
            $converter->setConfig($this->config);
        }

        foreach ($converter->getSupportedTags() as $supportedTag) {
            $this->converters[$supportedTag] = $converter;
        }
    }

    /**
     * Looks up the converter for a tag, falling back to the default converter.
     *
     * @param string $tag
     *
     * @return ConverterInterface
     */
    public function getConverterByTag($tag)
    {
        $key = isset($this->converters[$tag]) ? $tag : DefaultConverter::DEFAULT_CONVERTER;

        return $this->converters[$key];
    }

    /**
     * Creates an environment with all bundled converters registered.
     *
     * @param array $config
     *
     * @return Environment
     */
    public static function createDefaultEnvironment(array $config = array())
    {
        $environment = new static($config);

        $bundled = array(
            new BlockquoteConverter(),
            new CommentConverter(),
            new DivConverter(),
            new EmphasisConverter(),
            new HardBreakConverter(),
            new HeaderConverter(),
            new HorizontalRuleConverter(),
            new ImageConverter(),
            new LinkConverter(),
            new ListBlockConverter(),
            new ListItemConverter(),
            new ParagraphConverter(),
            new PreformattedConverter(),
            new TextConverter(),
        );

        foreach ($bundled as $converter) {
            $environment->addConverter($converter);
        }

        return $environment;
    }
}

View file

@ -0,0 +1,194 @@
<?php
namespace League\HTMLToMarkdown;
/**
* Class HtmlConverter
*
* A helper class to convert HTML to Markdown.
*
* @author Colin O'Dell <colinodell@gmail.com>
* @author Nick Cernis <nick@cern.is>
*
* @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
*
* @license http://www.opensource.org/licenses/mit-license.php MIT
*/
class HtmlConverter
{
    /**
     * Holds the configuration and the registered tag converters.
     *
     * @var Environment
     */
    protected $environment;

    /**
     * Constructor
     *
     * @param array $options Configuration options, merged over the defaults below
     */
    public function __construct(array $options = array())
    {
        $defaults = array(
            'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
            'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'bold_style' => '**', // Set to '__' if you prefer the underlined style
            'italic_style' => '*', // Set to '_' if you prefer the underlined style
            'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
        );

        $this->environment = Environment::createDefaultEnvironment($defaults);
        $this->environment->getConfig()->merge($options);
    }

    /**
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }

    /**
     * @return Configuration
     */
    public function getConfig()
    {
        return $this->environment->getConfig();
    }

    /**
     * Convert
     *
     * Loads the HTML into a DOM document, converts every node to Markdown in
     * place and returns the cleaned-up result.
     *
     * @param string $html
     *
     * @return string The Markdown version of the html ('' for blank input)
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     */
    public function convert($html)
    {
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        if (!($root = $document->getElementsByTagName('html')->item(0))) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $rootElement = new Element($root);
        $this->convertChildren($rootElement);

        // Store the now-modified DOMDocument as a string
        $markdown = $document->saveHTML();

        return $this->sanitize($markdown);
    }

    /**
     * Parses the HTML into a UTF-8 DOM document.
     *
     * When 'suppress_errors' is enabled, libxml errors are collected internally
     * while loading and then cleared; the previous libxml error-handling mode
     * is restored afterwards so the setting does not leak to other code.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        $suppress = $this->getConfig()->getOption('suppress_errors');
        $previous = null;

        if ($suppress) {
            // libxml_use_internal_errors() returns the mode that was active before the call.
            $previous = libxml_use_internal_errors(true);
        }

        // Prefixing an XML encoding declaration makes loadHTML() read the input as UTF-8.
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppress) {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $document;
    }

    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds children of each node and converts those to #text nodes containing their Markdown equivalent,
     * starting with the innermost element and working up to the outermost element.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        if ($element->isDescendantOf(array('pre', 'code'))) {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // Replace the old node e.g. '<h3>Title</h3>' with a text node holding its Markdown e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }

    /**
     * Convert to Markdown
     *
     * Produces the Markdown string for an individual node, using the converter
     * registered for its tag.
     *
     * Example: An <h3> node with text content of 'Title' yields '### Title'
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown; '' for tags listed in 'remove_nodes'
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // An empty string (rather than false) keeps the documented string return type.
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }

    /**
     * Cleans the serialised document: decodes entities, strips the doctype and
     * the wrapper tags, and trims surrounding newlines.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); // Double decode to cover cases like &amp;nbsp; http://www.php.net/manual/en/function.htmlentities.php#99984
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $unwanted = array('<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<?xml encoding="UTF-8">', '&#xD;');
        $markdown = str_replace($unwanted, '', $markdown); // Strip unwanted tags
        $markdown = trim($markdown, "\n\r\0\x0B");

        return $markdown;
    }
}
}