switch to html2text() instead of strip_tags() when preparing FTS index
This commit is contained in:
parent
2b61052e87
commit
03e956132d
|
@ -90,7 +90,7 @@ class Article extends Handler_Protected {
|
|||
SET tsvector_combined = to_tsvector( :ts_content)
|
||||
WHERE id = :id");
|
||||
$params = [
|
||||
":ts_content" => mb_substr(strip_tags($content ), 0, 900000),
|
||||
":ts_content" => mb_substr(\Soundasleep\Html2Text::convert($content), 0, 900000),
|
||||
":id" => $ref_id];
|
||||
$sth->execute($params);
|
||||
}
|
||||
|
@ -135,7 +135,7 @@ class Article extends Handler_Protected {
|
|||
SET tsvector_combined = to_tsvector( :ts_content)
|
||||
WHERE id = :id");
|
||||
$params = [
|
||||
":ts_content" => mb_substr(strip_tags($content ), 0, 900000),
|
||||
":ts_content" => mb_substr(\Soundasleep\Html2Text::convert($content), 0, 900000),
|
||||
":id" => $ref_id];
|
||||
$sth->execute($params);
|
||||
}
|
||||
|
|
|
@ -1184,7 +1184,7 @@ class RSSUtils {
|
|||
|
||||
if (Config::get(Config::DB_TYPE) == "pgsql") {
|
||||
$params[":ts_lang"] = $feed_language;
|
||||
$params[":ts_content"] = mb_substr(strip_tags($entry_title . " " . $entry_content), 0, 900000);
|
||||
$params[":ts_content"] = mb_substr(strip_tags($entry_title) . " " . \Soundasleep\Html2Text::convert($entry_content), 0, 900000);
|
||||
}
|
||||
|
||||
$sth->execute($params);
|
||||
|
|
|
@ -18,7 +18,8 @@
|
|||
"mervick/material-design-icons": "^2.2",
|
||||
"j4mie/idiorm": "dev-master",
|
||||
"open-telemetry/exporter-otlp": "^1.0",
|
||||
"php-http/guzzle7-adapter": "^1.0"
|
||||
"php-http/guzzle7-adapter": "^1.0",
|
||||
"soundasleep/html2text": "^2.1"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpstan/phpstan": "1.10.3",
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "2c8b76f35398131c362d125ed47c8102",
|
||||
"content-hash": "cbbbfbdbf1c5f659b8e34307411bc751",
|
||||
"packages": [
|
||||
{
|
||||
"name": "beberlei/assert",
|
||||
|
@ -1659,6 +1659,61 @@
|
|||
},
|
||||
"time": "2019-03-08T08:55:37+00:00"
|
||||
},
|
||||
{
|
||||
"name": "soundasleep/html2text",
|
||||
"version": "2.1.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/soundasleep/html2text.git",
|
||||
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/soundasleep/html2text/zipball/83502b6f8f1aaef8e2e238897199d64f284b4af3",
|
||||
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-dom": "*",
|
||||
"ext-libxml": "*",
|
||||
"php": "^7.3|^8.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpstan/phpstan": "^1.9",
|
||||
"phpunit/phpunit": "^7.0|^8.0|^9.0"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Soundasleep\\": "src"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Jevon Wright",
|
||||
"homepage": "https://jevon.org",
|
||||
"role": "Developer"
|
||||
}
|
||||
],
|
||||
"description": "A PHP script to convert HTML into a plain text format",
|
||||
"homepage": "https://github.com/soundasleep/html2text",
|
||||
"keywords": [
|
||||
"email",
|
||||
"html",
|
||||
"php",
|
||||
"text"
|
||||
],
|
||||
"support": {
|
||||
"email": "support@jevon.org",
|
||||
"issues": "https://github.com/soundasleep/html2text/issues",
|
||||
"source": "https://github.com/soundasleep/html2text/tree/2.1.0"
|
||||
},
|
||||
"time": "2023-01-06T09:28:15+00:00"
|
||||
},
|
||||
{
|
||||
"name": "spomky-labs/otphp",
|
||||
"version": "v10.0.3",
|
||||
|
|
|
@ -317,7 +317,7 @@
|
|||
|
||||
while (true) {
|
||||
foreach ($entries as $entry) {
|
||||
$tsvector_combined = mb_substr(strip_tags($entry->title . " " . $entry->content), 0, 1000000);
|
||||
$tsvector_combined = mb_substr(strip_tags($entry->title) . " " . \Soundasleep\Html2Text::convert($entry->content), 0, 900000);
|
||||
$usth->execute([$tsvector_combined, $entry->id]);
|
||||
$processed++;
|
||||
}
|
||||
|
|
|
@ -14,8 +14,9 @@ return array(
|
|||
'Symfony\\Polyfill\\Php81\\' => array($vendorDir . '/symfony/polyfill-php81'),
|
||||
'Symfony\\Polyfill\\Php80\\' => array($vendorDir . '/symfony/polyfill-php80'),
|
||||
'Symfony\\Polyfill\\Mbstring\\' => array($vendorDir . '/symfony/polyfill-mbstring'),
|
||||
'Soundasleep\\' => array($vendorDir . '/soundasleep/html2text/src'),
|
||||
'Psr\\Log\\' => array($vendorDir . '/psr/log/src'),
|
||||
'Psr\\Http\\Message\\' => array($vendorDir . '/psr/http-message/src', $vendorDir . '/psr/http-factory/src'),
|
||||
'Psr\\Http\\Message\\' => array($vendorDir . '/psr/http-factory/src', $vendorDir . '/psr/http-message/src'),
|
||||
'Psr\\Http\\Client\\' => array($vendorDir . '/psr/http-client/src'),
|
||||
'Prophecy\\' => array($vendorDir . '/phpspec/prophecy/src/Prophecy'),
|
||||
'PhpParser\\' => array($vendorDir . '/nikic/php-parser/lib/PhpParser'),
|
||||
|
|
|
@ -137,6 +137,7 @@ class ComposerStaticInit19fc2ff1c0f9a92279c7979386bb2056
|
|||
'Symfony\\Polyfill\\Php81\\' => 23,
|
||||
'Symfony\\Polyfill\\Php80\\' => 23,
|
||||
'Symfony\\Polyfill\\Mbstring\\' => 26,
|
||||
'Soundasleep\\' => 12,
|
||||
),
|
||||
'P' =>
|
||||
array (
|
||||
|
@ -219,14 +220,18 @@ class ComposerStaticInit19fc2ff1c0f9a92279c7979386bb2056
|
|||
array (
|
||||
0 => __DIR__ . '/..' . '/symfony/polyfill-mbstring',
|
||||
),
|
||||
'Soundasleep\\' =>
|
||||
array (
|
||||
0 => __DIR__ . '/..' . '/soundasleep/html2text/src',
|
||||
),
|
||||
'Psr\\Log\\' =>
|
||||
array (
|
||||
0 => __DIR__ . '/..' . '/psr/log/src',
|
||||
),
|
||||
'Psr\\Http\\Message\\' =>
|
||||
array (
|
||||
0 => __DIR__ . '/..' . '/psr/http-message/src',
|
||||
1 => __DIR__ . '/..' . '/psr/http-factory/src',
|
||||
0 => __DIR__ . '/..' . '/psr/http-factory/src',
|
||||
1 => __DIR__ . '/..' . '/psr/http-message/src',
|
||||
),
|
||||
'Psr\\Http\\Client\\' =>
|
||||
array (
|
||||
|
|
|
@ -3791,6 +3791,64 @@
|
|||
],
|
||||
"install-path": "../sebastian/version"
|
||||
},
|
||||
{
|
||||
"name": "soundasleep/html2text",
|
||||
"version": "2.1.0",
|
||||
"version_normalized": "2.1.0.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/soundasleep/html2text.git",
|
||||
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/soundasleep/html2text/zipball/83502b6f8f1aaef8e2e238897199d64f284b4af3",
|
||||
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-dom": "*",
|
||||
"ext-libxml": "*",
|
||||
"php": "^7.3|^8.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpstan/phpstan": "^1.9",
|
||||
"phpunit/phpunit": "^7.0|^8.0|^9.0"
|
||||
},
|
||||
"time": "2023-01-06T09:28:15+00:00",
|
||||
"type": "library",
|
||||
"installation-source": "dist",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Soundasleep\\": "src"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Jevon Wright",
|
||||
"homepage": "https://jevon.org",
|
||||
"role": "Developer"
|
||||
}
|
||||
],
|
||||
"description": "A PHP script to convert HTML into a plain text format",
|
||||
"homepage": "https://github.com/soundasleep/html2text",
|
||||
"keywords": [
|
||||
"email",
|
||||
"html",
|
||||
"php",
|
||||
"text"
|
||||
],
|
||||
"support": {
|
||||
"email": "support@jevon.org",
|
||||
"issues": "https://github.com/soundasleep/html2text/issues",
|
||||
"source": "https://github.com/soundasleep/html2text/tree/2.1.0"
|
||||
},
|
||||
"install-path": "../soundasleep/html2text"
|
||||
},
|
||||
{
|
||||
"name": "spomky-labs/otphp",
|
||||
"version": "v10.0.3",
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
'name' => '__root__',
|
||||
'pretty_version' => 'dev-master',
|
||||
'version' => 'dev-master',
|
||||
'reference' => '45a9ff0c88cbd33892ff16ab837e9059937d656e',
|
||||
'reference' => '2b61052e8709283d89997e351173bcb43a3c2c61',
|
||||
'type' => 'library',
|
||||
'install_path' => __DIR__ . '/../../',
|
||||
'aliases' => array(),
|
||||
|
@ -13,7 +13,7 @@
|
|||
'__root__' => array(
|
||||
'pretty_version' => 'dev-master',
|
||||
'version' => 'dev-master',
|
||||
'reference' => '45a9ff0c88cbd33892ff16ab837e9059937d656e',
|
||||
'reference' => '2b61052e8709283d89997e351173bcb43a3c2c61',
|
||||
'type' => 'library',
|
||||
'install_path' => __DIR__ . '/../../',
|
||||
'aliases' => array(),
|
||||
|
@ -371,8 +371,8 @@
|
|||
'psr/http-client-implementation' => array(
|
||||
'dev_requirement' => false,
|
||||
'provided' => array(
|
||||
0 => '*',
|
||||
1 => '1.0',
|
||||
0 => '1.0',
|
||||
1 => '*',
|
||||
),
|
||||
),
|
||||
'psr/http-factory' => array(
|
||||
|
@ -387,8 +387,8 @@
|
|||
'psr/http-factory-implementation' => array(
|
||||
'dev_requirement' => false,
|
||||
'provided' => array(
|
||||
0 => '*',
|
||||
1 => '1.0',
|
||||
0 => '1.0',
|
||||
1 => '*',
|
||||
),
|
||||
),
|
||||
'psr/http-message' => array(
|
||||
|
@ -403,8 +403,8 @@
|
|||
'psr/http-message-implementation' => array(
|
||||
'dev_requirement' => false,
|
||||
'provided' => array(
|
||||
0 => '*',
|
||||
1 => '1.0',
|
||||
0 => '1.0',
|
||||
1 => '*',
|
||||
),
|
||||
),
|
||||
'psr/log' => array(
|
||||
|
@ -569,6 +569,15 @@
|
|||
'aliases' => array(),
|
||||
'dev_requirement' => true,
|
||||
),
|
||||
'soundasleep/html2text' => array(
|
||||
'pretty_version' => '2.1.0',
|
||||
'version' => '2.1.0.0',
|
||||
'reference' => '83502b6f8f1aaef8e2e238897199d64f284b4af3',
|
||||
'type' => 'library',
|
||||
'install_path' => __DIR__ . '/../soundasleep/html2text',
|
||||
'aliases' => array(),
|
||||
'dev_requirement' => false,
|
||||
),
|
||||
'spomky-labs/otphp' => array(
|
||||
'pretty_version' => 'v10.0.3',
|
||||
'version' => '10.0.3.0',
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
# EditorConfig is awesome: http://EditorConfig.org
|
||||
|
||||
# top-most EditorConfig file
|
||||
root = true
|
||||
|
||||
# Unix-style newlines with a newline ending every file
|
||||
[*]
|
||||
end_of_line = lf
|
||||
charset = utf-8
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
indent_style = tab
|
||||
indent_size = 4
|
||||
|
||||
[*.md]
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
# don't add newlines to test files
|
||||
[tests/*]
|
||||
indent_style = tab
|
||||
trim_trailing_whitespace = false
|
||||
insert_final_newline = false
|
|
@ -0,0 +1,17 @@
|
|||
name: Lint
|
||||
on:
|
||||
- push
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Setup PHP
|
||||
uses: shivammathur/setup-php@v2
|
||||
with:
|
||||
php-version: '7.4'
|
||||
tools: phplint
|
||||
- name: Check syntax
|
||||
run: phplint .
|
|
@ -0,0 +1,41 @@
|
|||
name: Test
|
||||
on:
|
||||
- push
|
||||
jobs:
|
||||
test:
|
||||
strategy:
|
||||
matrix:
|
||||
operating-system:
|
||||
- ubuntu-latest
|
||||
php-version:
|
||||
- '7.3'
|
||||
- '7.4'
|
||||
- '8.0'
|
||||
- '8.1'
|
||||
- '8.2'
|
||||
name: php ${{ matrix.php-version }} on ${{ matrix.operating-system }}
|
||||
runs-on: ${{ matrix.operating-system }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Setup PHP
|
||||
uses: shivammathur/setup-php@v2
|
||||
with:
|
||||
php-version: ${{ matrix.php-version }}
|
||||
extensions: mbstring
|
||||
coverage: none
|
||||
- name: Get composer cache directory
|
||||
id: composer-cache
|
||||
run: echo "::set-output name=dir::$(composer config cache-files-dir)"
|
||||
- name: Setup composer cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ${{ steps.composer-cache.outputs.dir }}
|
||||
key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.lock') }}
|
||||
restore-keys: ${{ runner.os }}-composer-
|
||||
- name: Install composer dependencies
|
||||
env:
|
||||
COMPOSER_AUTH: ${{ secrets.COMPOSER_AUTH }}
|
||||
run: composer install --no-ansi --no-interaction --no-scripts --no-progress --prefer-dist
|
||||
- name: Run tests
|
||||
run: vendor/bin/phpunit
|
|
@ -0,0 +1,7 @@
|
|||
tests/*.output
|
||||
*.sublime-project
|
||||
*.sublime-workspace
|
||||
vendor/
|
||||
**/*.DS_Store
|
||||
.phpunit.result.cache
|
||||
composer.lock
|
|
@ -0,0 +1,37 @@
|
|||
# Changelog
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [1.1.0] - 2019-02-15
|
||||
### Added
|
||||
- Zero-width non-joiners are now stripped to prevent output issues, similar to non-breaking whitespace
|
||||
|
||||
### Fixed
|
||||
- Fix namespace in composer [#67](https://github.com/soundasleep/html2text/pull/67)
|
||||
|
||||
## [1.0.0] - 2019-02-14
|
||||
### Added
|
||||
- Added `drop_links` option to render links without the target href [#65](https://github.com/soundasleep/html2text/pull/65)
|
||||
|
||||
### Changed
|
||||
- **Important:** Changed namespace from `\Html2Text\Html2Text` to `\Soundasleep\Html2Text` [#45](https://github.com/soundasleep/html2text/issues/45)
|
||||
- Treat non-breaking spaces consistently: never include them in output text [#64](https://github.com/soundasleep/html2text/pull/64)
|
||||
- Second argument to `convert()` is now an array, rather than boolean [#65](https://github.com/soundasleep/html2text/pull/65)
|
||||
- Optimise/improve newline & whitespace handling [#47](https://github.com/soundasleep/html2text/pull/47)
|
||||
- Upgrade PHP support to PHP 7.3+
|
||||
- Upgrade PHPUnit to 7.x
|
||||
- Re-release project under MIT license [#58](https://github.com/soundasleep/html2text/issues/58)
|
||||
|
||||
## [0.5.0] - 2017-04-20
|
||||
### Added
|
||||
- Add ignore_error optional argument [#63](https://github.com/soundasleep/html2text/pull/63)
|
||||
- Blockquote support [#50](https://github.com/soundasleep/html2text/pull/50)
|
||||
|
||||
[Unreleased]: https://github.com/soundasleep/html2text/compare/1.1.0...HEAD
|
||||
[1.1.0]: https://github.com/soundasleep/html2text/compare/1.0.0...1.1.0
|
||||
[1.0.0]: https://github.com/soundasleep/html2text/compare/0.5.0...1.0.0
|
||||
[0.5.0]: https://github.com/soundasleep/html2text/compare/0.3.4...0.5.0
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2019 Jevon Wright
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,102 @@
|
|||
![example workflow](https://github.com/soundasleep/html2text/actions/workflows/test.yml/badge.svg) [![Total Downloads](https://poser.pugx.org/soundasleep/html2text/downloads.png)](https://packagist.org/packages/soundasleep/html2text)
|
||||
=========
|
||||
|
||||
html2text is a very simple script that uses DOM methods to convert HTML into a format similar to what would be
|
||||
rendered by a browser - perfect for places where you need a quick text representation. For example:
|
||||
|
||||
```html
|
||||
<html>
|
||||
<title>Ignored Title</title>
|
||||
<body>
|
||||
<h1>Hello, World!</h1>
|
||||
|
||||
<p>This is some e-mail content.
|
||||
Even though it has whitespace and newlines, the e-mail converter
|
||||
will handle it correctly.
|
||||
|
||||
<p>Even mismatched tags.</p>
|
||||
|
||||
<div>A div</div>
|
||||
<div>Another div</div>
|
||||
<div>A div<div>within a div</div></div>
|
||||
|
||||
<a href="http://foo.com">A link</a>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
```
|
||||
|
||||
Will be converted into:
|
||||
|
||||
```text
|
||||
Hello, World!
|
||||
|
||||
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
|
||||
|
||||
Even mismatched tags.
|
||||
|
||||
A div
|
||||
Another div
|
||||
A div
|
||||
within a div
|
||||
|
||||
[A link](http://foo.com)
|
||||
```
|
||||
|
||||
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
|
||||
|
||||
## Installing
|
||||
|
||||
You can use [Composer](http://getcomposer.org/) to add the [package](https://packagist.org/packages/soundasleep/html2text) to your project:
|
||||
|
||||
```json
|
||||
{
|
||||
"require": {
|
||||
"soundasleep/html2text": "~1.1"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
And then use it quite simply:
|
||||
|
||||
```php
|
||||
$text = \Soundasleep\Html2Text::convert($html);
|
||||
```
|
||||
|
||||
You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead.
|
||||
|
||||
### Options
|
||||
|
||||
| Option | Default | Description |
|
||||
|--------|---------|-------------|
|
||||
| **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. |
|
||||
| **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. |
|
||||
| **char_set** | `'auto'` | Specify a specific character set. Pass multiple character sets (comma separated) to detect encoding, default is ASCII,UTF-8 |
|
||||
|
||||
Pass along options as a second argument to `convert`, for example:
|
||||
|
||||
```php
|
||||
$options = array(
|
||||
'ignore_errors' => true,
|
||||
// other options go here
|
||||
);
|
||||
$text = \Soundasleep\Html2Text::convert($html, $options);
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Class 'DOMDocument' not found
|
||||
|
||||
You need to [install the PHP XML extension](https://github.com/soundasleep/html2text/issues/55) for your PHP version. e.g. `apt-get install php7.4-xml`
|
||||
|
||||
## License
|
||||
|
||||
`html2text` is [licensed under MIT](LICENSE.md), making it suitable for both Eclipse and GPL projects.
|
||||
|
||||
## Other versions
|
||||
|
||||
Also see [html2text_ruby](https://github.com/soundasleep/html2text_ruby), a Ruby implementation.
|
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"name": "soundasleep/html2text",
|
||||
"description": "A PHP script to convert HTML into a plain text format",
|
||||
"type": "library",
|
||||
"keywords": [ "php", "html", "text", "email" ],
|
||||
"homepage": "https://github.com/soundasleep/html2text",
|
||||
"license": "MIT",
|
||||
"authors": [
|
||||
{
|
||||
"name": "Jevon Wright",
|
||||
"homepage": "https://jevon.org",
|
||||
"role": "Developer"
|
||||
}
|
||||
],
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Soundasleep\\": "src"
|
||||
}
|
||||
},
|
||||
"support": {
|
||||
"email": "support@jevon.org"
|
||||
},
|
||||
"require": {
|
||||
"php": "^7.3|^8.0",
|
||||
"ext-dom": "*",
|
||||
"ext-libxml": "*"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^7.0|^8.0|^9.0",
|
||||
"phpstan/phpstan": "^1.9"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
<?php
|
||||
/**
|
||||
* This file allows you to convert through the command line.
|
||||
* Usage:
|
||||
* php -f convert.php [input file]
|
||||
*/
|
||||
|
||||
// Exactly one positional argument (the input file) is required after the script name.
$cli_arguments = $argv;

if (count($cli_arguments) < 2) {
	throw new \InvalidArgumentException("Expected: php -f convert.php [input file]");
}

$input_path = $cli_arguments[1];

if (!file_exists($input_path)) {
	throw new \InvalidArgumentException("'" . $input_path . "' does not exist");
}

// Read the HTML first, then load the converter classes by hand (no autoloader here).
$input = file_get_contents($input_path);

require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");

// Print the plain-text rendering to standard output.
$plain_text = \Soundasleep\Html2Text::convert($input);
echo $plain_text;
|
|
@ -0,0 +1,16 @@
|
|||
<?php
|
||||
/**
|
||||
* This file is available if you still want to use functions rather than
|
||||
* autoloading classes.
|
||||
*/
|
||||
|
||||
require_once(__DIR__ . "/src/Html2Text.php");
|
||||
require_once(__DIR__ . "/src/Html2TextException.php");
|
||||
|
||||
/**
 * Procedural wrapper around Soundasleep\Html2Text::convert().
 *
 * @param string $html the HTML to convert
 * @param bool|array $ignore_error forwarded unchanged as the second argument of convert()
 * @return string whatever convert() returns for the given input
 */
function convert_html_to_text($html, $ignore_error = false) {
	$plain_text = Soundasleep\Html2Text::convert($html, $ignore_error);

	return $plain_text;
}
|
||||
|
||||
/**
 * Procedural wrapper around Soundasleep\Html2Text::fixNewlines().
 *
 * @param string $text text to pass through
 * @return string whatever fixNewlines() returns for the given text
 */
function fix_newlines($text) {
	$normalised = Soundasleep\Html2Text::fixNewlines($text);

	return $normalised;
}
|
|
@ -0,0 +1,7 @@
|
|||
parameters:
|
||||
level: 6
|
||||
errorFormat: raw
|
||||
editorUrl: '%%file%% %%line%% %%column%%: %%error%%'
|
||||
paths:
|
||||
- src
|
||||
- tests
|
|
@ -0,0 +1,8 @@
|
|||
<phpunit stopOnFailure="true" stopOnError="true" beStrictAboutTestsThatDoNotTestAnything="false">
|
||||
<testsuites>
|
||||
<testsuite name="Tests">
|
||||
<!-- loads all *Test.php -->
|
||||
<directory>tests</directory>
|
||||
</testsuite>
|
||||
</testsuites>
|
||||
</phpunit>
|
|
@ -0,0 +1,540 @@
|
|||
<?php
|
||||
|
||||
namespace Soundasleep;
|
||||
|
||||
class Html2Text {
|
||||
|
||||
/** @return array<string, bool | string> */
|
||||
public static function defaultOptions(): array {
	// Built key by key; insertion order matches the documented option order.
	$defaults = [];
	$defaults['ignore_errors'] = false;
	$defaults['drop_links'] = false;
	$defaults['char_set'] = 'auto';

	return $defaults;
}
|
||||
|
||||
/**
|
||||
* Tries to convert the given HTML into a plain text format - best suited for
|
||||
* e-mail display, etc.
|
||||
*
|
||||
* <p>In particular, it tries to maintain the following features:
|
||||
* <ul>
|
||||
* <li>Links are maintained, with the 'href' copied over
|
||||
* <li>Information in the <head> is lost
|
||||
* </ul>
|
||||
*
|
||||
* @param string $html the input HTML
|
||||
* @param boolean|array<string, bool | string> $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto']
|
||||
* @return string the HTML converted, as best as possible, to text
|
||||
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
|
||||
*/
|
||||
public static function convert(string $html, $options = []): string {

	if ($options === false || $options === true) {
		// Using old style (< 1.0) of passing in options
		$options = ['ignore_errors' => $options];
	}

	$options = array_merge(static::defaultOptions(), $options);

	// check all options are valid; the option *names* are the keys of the
	// defaults array, so those are what gets compared and reported
	$valid_options = array_keys(static::defaultOptions());
	foreach ($options as $key => $value) {
		// strict comparison: a loose in_array() lets integer key 0 match any
		// option name on PHP 7
		if (!in_array($key, $valid_options, true)) {
			throw new \InvalidArgumentException("Unknown html2text option '$key'. Valid options are " . implode(',', $valid_options));
		}
	}

	$is_office_document = self::isOfficeDocument($html);

	if ($is_office_document) {
		// remove office namespace
		$html = str_replace(["<o:p>", "</o:p>"], "", $html);
	}

	$html = self::fixNewlines($html);

	// use mb_convert_encoding for legacy versions of php
	if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION < 81 && mb_detect_encoding($html, "UTF-8", true)) {
		$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
	}

	$doc = self::getDocument($html, $options);

	$output = self::iterateOverNode($doc, null, false, $is_office_document, $options);

	// process output for whitespace/newlines
	$output = self::processWhitespaceNewlines($output);

	return $output;
}
|
||||
|
||||
/**
|
||||
* Unify newlines; in particular, \r\n becomes \n, and
|
||||
* then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
|
||||
* all become \ns.
|
||||
*
|
||||
* @param string $text text with any number of \r, \r\n and \n combinations
|
||||
* @return string the fixed text
|
||||
*/
|
||||
public static function fixNewlines(string $text): string {
	// str_replace() works through the search list in order, so "\r\n" pairs
	// collapse to "\n" first and any "\r" still left is converted afterwards
	return str_replace(["\r\n", "\r"], "\n", $text);
}
|
||||
|
||||
/** @return array<string> */
|
||||
public static function nbspCodes(): array {
	// UTF-8 byte sequence of U+00A0 (non-breaking space)
	$utf8_bytes = "\xc2\xa0";

	// NOTE: PHP only expands the braced form \u{...} inside double quotes, so
	// this entry is the literal six-character text backslash-u-0-0-a-0
	$escaped_text = "\u00a0";

	return [$utf8_bytes, $escaped_text];
}
|
||||
|
||||
/** @return array<string> */
|
||||
public static function zwnjCodes(): array {
	// UTF-8 byte sequence of U+200C (zero-width non-joiner)
	$utf8_bytes = "\xe2\x80\x8c";

	// NOTE: PHP only expands the braced form \u{...} inside double quotes, so
	// this entry is the literal six-character text backslash-u-2-0-0-c
	$escaped_text = "\u200c";

	return [$utf8_bytes, $escaped_text];
}
|
||||
|
||||
/**
|
||||
* Remove leading or trailing spaces and excess empty lines from provided multiline text
|
||||
*
|
||||
* @param string $text multiline text any number of leading or trailing spaces or excess lines
|
||||
* @return string the fixed text
|
||||
*/
|
||||
public static function processWhitespaceNewlines(string $text): string {

	// squeeze spaces that surround a tab, then strip whitespace at the very start
	$text = ltrim(preg_replace("/ *\t */im", "\t", $text));

	// strip the spaces/tabs that follow each newline (per-line leading whitespace)
	$text = preg_replace("/\n[ \t]*/im", "\n", $text);

	// only now turn non-breaking spaces into regular ones: done after the
	// leading-space removal so nbsp used as indentation survives it
	$text = rtrim(self::renderText($text));

	// strip the spaces/tabs that precede each newline (per-line trailing whitespace)
	$text = preg_replace("/[ \t]*\n/im", "\n", $text);

	// unarmor pre blocks: any remaining \r becomes \n again
	$text = self::fixNewlines($text);

	// collapse runs of two or more newlines into exactly one empty line
	return preg_replace("/\n\n\n*/im", "\n\n", $text);
}
|
||||
|
||||
/**
|
||||
* Can we guess that this HTML is generated by Microsoft Office?
|
||||
*/
|
||||
public static function isOfficeDocument(string $html): bool {
	// the Office XML namespace URN anywhere in the markup is taken as the signal
	$marker_offset = strpos($html, "urn:schemas-microsoft-com:office");

	return $marker_offset !== false;
}
|
||||
|
||||
public static function isWhitespace(string $text): bool {
	// nbsp/zwnj are normalised first, then newlines, carriage returns, tabs
	// and spaces are trimmed; nothing left over means "whitespace only"
	$remaining = trim(self::renderText($text), "\n\r\t ");

	return $remaining === '';
}
|
||||
|
||||
/**
|
||||
* Parse HTML into a DOMDocument
|
||||
*
|
||||
* @param string $html the input HTML
|
||||
* @param array<string, bool | string> $options
|
||||
* @return \DOMDocument the parsed document tree
|
||||
*/
|
||||
private static function getDocument(string $html, array $options): \DOMDocument {

	$doc = new \DOMDocument();

	$html = trim($html);

	// Compare against the empty string explicitly: a plain truthiness test
	// would also discard the input "0".
	if ($html === '') {
		// DOMDocument doesn't support empty value and throws an error
		// Return empty document instead
		return $doc;
	}

	if ($html[0] !== '<') {
		// If HTML does not begin with a tag, we put a body tag around it.
		// If we do not do this, PHP will insert a paragraph tag around
		// the first block of text for some reason which can mess up
		// the newlines. See pre.html test for an example.
		$html = '<body>' . $html . '</body>';
	}

	$header = '';
	// use char sets for modern versions of php
	if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION >= 81) {
		// use specified char_set, or auto detect if not set
		$char_set = ! empty($options['char_set']) ? $options['char_set'] : 'auto';
		if ('auto' === $char_set) {
			$char_set = mb_detect_encoding($html);
		} else if (strpos($char_set, ',') !== false) {
			// Hand the candidate list straight to mb_detect_encoding() instead of
			// calling mb_detect_order(), which would change the detection order
			// for the rest of the process.
			$char_set = mb_detect_encoding($html, $char_set);
		}
		// mb_detect_encoding() returns false when nothing matches; normalise to
		// an empty string so the string operations below always get a string
		$char_set = (string) $char_set;
		// turn off error detection for Windows-1252 legacy html
		if (strpos($char_set, '1252') !== false) {
			$options['ignore_errors'] = true;
		}
		$header = '<?xml version="1.0" encoding="' . $char_set . '">';
	}

	if (! empty($options['ignore_errors'])) {
		$doc->strictErrorChecking = false;
		$doc->recover = true;
		$doc->xmlStandalone = true;
		// collect libxml errors internally while loading, then restore the previous setting
		$old_internal_errors = libxml_use_internal_errors(true);
		$load_result = $doc->loadHTML($header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
		libxml_use_internal_errors($old_internal_errors);
	}
	else {
		$load_result = $doc->loadHTML($header . $html);
	}

	if (!$load_result) {
		throw new Html2TextException("Could not load HTML - badly formed?", $html);
	}

	return $doc;
}
|
||||
|
||||
/**
|
||||
* Replace any special characters with simple text versions, to prevent output issues:
|
||||
* - Convert non-breaking spaces to regular spaces; and
|
||||
* - Convert zero-width non-joiners to '' (nothing).
|
||||
*
|
||||
* This is to match our goal of rendering documents as they would be rendered
|
||||
* by a browser.
|
||||
*/
|
||||
private static function renderText(string $text): string {
	// every non-breaking-space code becomes a regular space ...
	$without_nbsp = str_replace(self::nbspCodes(), " ", $text);

	// ... and every zero-width non-joiner code is dropped entirely
	return str_replace(self::zwnjCodes(), "", $without_nbsp);
}
|
||||
|
||||
private static function nextChildName(?\DOMNode $node): ?string {
	// Walk the following siblings and stop at the first one that is either an
	// element or a text node with non-whitespace content; whitespace-only text
	// and every other node type are skipped.
	for ($sibling = $node->nextSibling; $sibling != null; $sibling = $sibling->nextSibling) {
		$is_meaningful_text = $sibling instanceof \DOMText && !self::isWhitespace($sibling->wholeText);

		if ($is_meaningful_text || $sibling instanceof \DOMElement) {
			return strtolower($sibling->nodeName);
		}
	}

	// ran out of siblings without finding one
	return null;
}
|
||||
|
||||
/**
 * Recursively renders a DOM node, and everything below it, as plain text.
 *
 * Text nodes are rendered through self::renderText() and have their
 * whitespace collapsed (or, inside <pre>, preserved with newlines temporarily
 * encoded as "\r"). Element nodes contribute leading whitespace, the rendered
 * text of their children, and trailing whitespace or markup depending on the
 * tag name.
 *
 * NOTE: this method is destructive. Every child is removed from $node once it
 * has been rendered.
 *
 * @param \DOMNode $node node to render
 * @param string|null $prevName lower-cased name of the previous significant
 *                              sibling, or null when there is none
 * @param bool $in_pre true when an ancestor of $node is a <pre> element
 * @param bool $is_office_document true enables the `MsoNormal` paragraph handling
 * @param array<string, bool | string> $options conversion options; only
 *                                              'drop_links' is read here
 * @return string plain-text rendering of $node
 */
private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool $is_office_document, array $options): string {
    if ($node instanceof \DOMText) {
        if ($in_pre) {
            $text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n";

            // Remove trailing whitespace only
            $text = preg_replace("/[ \t]*\n/im", "\n", $text);

            // armor newlines with \r.
            // NOTE(review): presumably the caller turns "\r" back into "\n"
            // after its own whitespace clean-up - confirm in convert().
            return str_replace("\n", "\r", $text);
        }

        // Replace whitespace characters with a space (equivalent to \s)
        $text = self::renderText($node->wholeText);
        $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

        // Visible text directly after a <p> or <div> starts on a new line.
        if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
            return "\n" . $text;
        }
        return $text;
    }

    if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
        // ignore
        return "";
    }

    $name = strtolower($node->nodeName);
    $nextName = self::nextChildName($node);

    // start whitespace
    switch ($name) {
        case "hr":
            $prefix = '';
            if ($prevName != null) {
                $prefix = "\n";
            }
            return $prefix . "---------------------------------------------------------------\n";

        case "style":
        case "head":
        case "title":
        case "meta":
        case "script":
            // ignore these tags
            return "";

        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "ol":
        case "ul":
        case "pre":
            // add two newlines
            $output = "\n\n";
            break;

        case "td":
        case "th":
            // add tab char to separate table fields
            $output = "\t";
            break;

        case "p":
            // Microsoft exchange emails often include HTML which, when passed through
            // html2text, results in lots of double line returns everywhere.
            //
            // To fix this, for any p element with a className of `MsoNormal` (the standard
            // classname in any Microsoft export or outlook for a paragraph that behaves
            // like a line return) we skip the first line returns and set the name to br.
            // @phpstan-ignore-next-line
            if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
                $output = "";
                $name = 'br';
                break;
            }

            // add two lines
            $output = "\n\n";
            break;

        case "tr":
            // add one line
            $output = "\n";
            break;

        case "div":
            $output = "";
            if ($prevName !== null) {
                // add one line
                $output .= "\n";
            }
            break;

        case "li":
            $output = "- ";
            break;

        default:
            // print out contents of unknown tags
            $output = "";
            break;
    }

    // debug
    //$output .= "[$name,$nextName]";

    if (isset($node->childNodes)) {

        $n = $node->childNodes->item(0);
        $previousSiblingNames = [];
        $previousSiblingName = null;

        $parts = [];
        $trailing_whitespace = 0;

        while ($n != null) {

            $text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

            // Pass current node name to next child, as previousSibling does not appear to get populated
            if ($n instanceof \DOMDocumentType
                || $n instanceof \DOMProcessingInstruction
                || ($n instanceof \DOMText && self::isWhitespace($text))) {
                // Keep current previousSiblingName, these are invisible
                $trailing_whitespace++;
            } else {
                $previousSiblingName = strtolower($n->nodeName);
                $previousSiblingNames[] = $previousSiblingName;
                $trailing_whitespace = 0;
            }

            // Always consume the first remaining child: the rendered child is
            // detached, so item(0) is the next one to process.
            $node->removeChild($n);
            $n = $node->childNodes->item(0);

            $parts[] = $text;
        }

        // Remove trailing whitespace, important for the br check below
        while ($trailing_whitespace-- > 0) {
            array_pop($parts);
        }

        // suppress last br tag inside a node list if follows text
        $last_name = array_pop($previousSiblingNames);
        if ($last_name === 'br') {
            $last_name = array_pop($previousSiblingNames);
            if ($last_name === '#text') {
                array_pop($parts);
            }
        }

        $output .= implode('', $parts);
    }

    // end whitespace
    switch ($name) {
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "pre":
        case "p":
            // add two lines
            $output .= "\n\n";
            break;

        case "br":
            // add one line
            $output .= "\n";
            break;

        case "div":
            break;

        case "a":
            // links are returned in [text](link) format
            // @phpstan-ignore-next-line
            $href = $node->getAttribute("href");

            $output = trim($output);

            // remove double [[ ]] s from linking images
            if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
                $output = substr($output, 1, strlen($output) - 2);

                // for linking images, the title of the <a> overrides the title of the <img>
                // @phpstan-ignore-next-line
                if ($node->getAttribute("title")) {
                    // @phpstan-ignore-next-line
                    $output = $node->getAttribute("title");
                }
            }

            // if there is no link text, but a title attr
            // (compared against '' rather than by truthiness so that a link
            // whose text is "0" keeps its text)
            // @phpstan-ignore-next-line
            if ($output === '' && $node->getAttribute("title")) {
                // @phpstan-ignore-next-line
                $output = $node->getAttribute("title");
            }

            if ($href == null) {
                // it doesn't link anywhere
                // @phpstan-ignore-next-line
                if ($node->getAttribute("name") != null) {
                    // named anchors are bracketed unless links are being dropped
                    if (!$options['drop_links']) {
                        $output = "[$output]";
                    }
                }
            } else {
                if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
                    // link to the same address: just use link (text is left as it is)
                } elseif ($output !== '') {
                    // replace it, unless links are being dropped
                    if (!$options['drop_links']) {
                        $output = "[$output]($href)";
                    }
                } else {
                    // empty string
                    $output = "$href";
                }
            }

            // does the next node require additional whitespace?
            switch ($nextName) {
                case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
                    $output .= "\n";
                    break;
            }
            break;

        case "img":
            // title wins over alt; an image with neither renders as nothing
            // @phpstan-ignore-next-line
            if ($node->getAttribute("title")) {
                // @phpstan-ignore-next-line
                $output = "[" . $node->getAttribute("title") . "]";
                // @phpstan-ignore-next-line
            } elseif ($node->getAttribute("alt")) {
                // @phpstan-ignore-next-line
                $output = "[" . $node->getAttribute("alt") . "]";
            } else {
                $output = "";
            }
            break;

        case "li":
            $output .= "\n";
            break;

        case "blockquote":
            // process quoted text for whitespace/newlines
            $output = self::processWhitespaceNewlines($output);

            // add leading newline
            $output = "\n" . $output;

            // prepend '> ' at the beginning of all lines
            $output = preg_replace("/\n/im", "\n> ", $output);

            // replace leading '> >' with '>>'
            $output = preg_replace("/\n> >/im", "\n>>", $output);

            // add another leading newline and trailing newlines
            $output = "\n" . $output . "\n\n";
            break;
        default:
            // do nothing
    }

    return $output;
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
<?php
|
||||
|
||||
namespace Soundasleep;
|
||||
|
||||
/**
 * Exception type of the html2text library. In addition to the usual message
 * it carries a free-form "more info" string supplied by the thrower.
 */
class Html2TextException extends \Exception {

    /** @var string $more_info extra detail passed to the constructor; "" when none was given */
    public $more_info;

    /**
     * @param string $message exception message
     * @param string $more_info additional detail, exposed as $more_info
     * @param int $code optional error code forwarded to \Exception
     * @param \Throwable|null $previous optional underlying cause, available
     *                                  through getPrevious()
     */
    public function __construct(string $message = "", string $more_info = "", int $code = 0, ?\Throwable $previous = null) {
        // Forward code and cause so callers can chain exceptions; both
        // default to the values \Exception would use on its own.
        parent::__construct($message, $code, $previous);
        $this->more_info = $more_info;
    }

}
|
|
@ -0,0 +1,97 @@
|
|||
<?php
|
||||
|
||||
require(__DIR__ . "/../src/Html2Text.php");
|
||||
|
||||
/**
 * Fixture-driven tests for \Soundasleep\Html2Text::convert().
 *
 * Each test converts html/<name>.html and compares the result with
 * txt/<name>.txt. Mismatching output is saved under failures/ for inspection.
 */
class Html2TextTest extends \PHPUnit\Framework\TestCase {

    /**
     * Deletes all failure artefacts left by a previous run before we start.
     * Dot files and anything that is not a regular file (sub-directories, for
     * example, which unlink() cannot remove) are left alone.
     */
    public static function setUpBeforeClass(): void {
        foreach (new DirectoryIterator(__DIR__ . '/failures') as $fileInfo) {
            if ($fileInfo->isFile() && $fileInfo->getFilename()[0] !== '.') {
                unlink($fileInfo->getPathname());
            }
        }
    }

    /**
     * @dataProvider providerFiles
     */
    public function testFile(string $test): void {
        $this->doTestWithResults($test, $test, []);
    }

    /**
     * Converts html/$test.html and asserts the result equals txt/$result.txt
     * (after \Soundasleep\Html2Text::fixNewlines()). On a mismatch the actual
     * output is written to failures/$result.output.
     *
     * @param bool | array<string, bool | string> $options passed straight to convert()
     */
    public function doTestWithResults(string $test, string $result, $options = []): void {
        $html = __DIR__ . "/html/$test.html";
        $txt = __DIR__ . "/txt/$result.txt";
        $this->assertFileExists($html, "File '{$html}' does not exist");
        $this->assertFileExists($txt, "File '{$txt}' does not exist");

        // file_get_contents() returns false on failure; fail with a clear
        // message instead of passing false on to the converter.
        $input = file_get_contents($html);
        $this->assertNotFalse($input, "File '{$html}' could not be read");
        $expected_raw = file_get_contents($txt);
        $this->assertNotFalse($expected_raw, "File '{$txt}' could not be read");
        $expected = \Soundasleep\Html2Text::fixNewlines($expected_raw);

        $output = \Soundasleep\Html2Text::convert($input, $options);

        // Strict comparison: loose != treats numeric-looking strings such as
        // "1e3" and "1000" as equal.
        if ($output !== $expected) {
            file_put_contents(__DIR__ . "/failures/$result.output", $output);
        }
        $this->assertSame($expected, $output, "{$html} file failed to convert to {$txt}");
    }

    /**
     * Names of the fixtures that are converted with default options.
     *
     * @return array<array<string>>
     */
    public static function providerFiles(): array {
        return [
            ['basic'],
            ['anchors'],
            ['more-anchors'],
            ['test3'],
            ['test4'],
            ['table'],
            ['nbsp'],
            ['lists'],
            ['pre'],
            ['newlines'],
            ['nested-divs'],
            ['blockquotes'],
            ['full_email'],
            ['images'],
            ['non-breaking-spaces'],
            ['utf8-example'],
            ['msoffice'],
            ['dom-processing'],
            ['empty'],
            ['huge-msoffice'],
            ['zero-width-non-joiners'],
        ];
    }

    public function testInvalidXML(): void {
        // NOTE(review): expectWarning() only exists in some PHPUnit versions -
        // confirm against the PHPUnit version this suite is run with.
        $this->expectWarning();
        $this->doTestWithResults("invalid", "invalid", ['ignore_errors' => false]);
    }

    public function testInvalidXMLIgnore(): void {
        $this->doTestWithResults("invalid", "invalid", ['ignore_errors' => true]);
    }

    public function testInvalidXMLIgnoreOldSyntax(): void {
        // for BC, allow old #convert(text, bool) syntax
        $this->doTestWithResults("invalid", "invalid", true);
    }

    public function testInvalidOption(): void {
        $this->expectException(InvalidArgumentException::class);
        $this->doTestWithResults("basic", "basic", ['invalid_option' => true]);
    }

    public function testBasicDropLinks(): void {
        $this->doTestWithResults("basic", "basic.no-links", ['drop_links' => true]);
    }

    public function testAnchorsDropLinks(): void {
        $this->doTestWithResults("anchors", "anchors.no-links", ['drop_links' => true]);
    }

    public function testWindows1252(): void {
        $this->doTestWithResults("windows-1252-example", "windows-1252-example", ['char_set' => 'windows-1252']);
    }
}
|
|
@ -0,0 +1,8 @@
|
|||
# Ignore everything
|
||||
*
|
||||
|
||||
# But not these files...
|
||||
!.gitignore
|
||||
|
||||
# ...even if they are in subdirectories
|
||||
!*/
|
|
@ -0,0 +1,12 @@
|
|||
A document without any HTML open/closing tags.
|
||||
|
||||
<hr>
|
||||
|
||||
We try and use the representation given by common browsers of the
|
||||
HTML document, so that it looks similar when converted to plain text.
|
||||
|
||||
<a href="http://foo.com">visit foo.com</a> - or <a href="http://www.foo.com">http://www.foo.com</a>
|
||||
|
||||
<a href="http://foo.com" title="a link with a title">link</a>
|
||||
|
||||
<h2><a name="anchor">An anchor which will not appear</a></h2>
|
|
@ -0,0 +1,21 @@
|
|||
<html>
|
||||
<title>Ignored Title</title>
|
||||
<body>
|
||||
<h1>Hello, World!</h1>
|
||||
|
||||
<p>This is some e-mail content.
|
||||
Even though it has whitespace and newlines, the e-mail converter
|
||||
will handle it correctly.
|
||||
|
||||
<p>Even mismatched tags.</p>
|
||||
|
||||
<div>A div</div>
|
||||
<div>Another div</div>
|
||||
<div>A div<div>within a div</div></div>
|
||||
|
||||
<p>Another line<br />Yet another line</p>
|
||||
|
||||
<a href="http://foo.com">A link</a>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,43 @@
|
|||
|
||||
<span>Hello</span>
|
||||
<blockquote>
|
||||
Nest some block quotes with preformated text
|
||||
<blockquote>
|
||||
Here is the code
|
||||
<pre>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(){
|
||||
return 0;
|
||||
};
|
||||
|
||||
</pre>
|
||||
|
||||
<b>Put some tags</b>
|
||||
<i>at the end</i>
|
||||
</blockquote>
|
||||
|
||||
Some text <span>and tags</span> here
|
||||
|
||||
<blockquote>
|
||||
First line
|
||||
<h1>Header 1</h1>
|
||||
Some text
|
||||
<hr>
|
||||
Some more text
|
||||
<p>Paragraph tag!</p>
|
||||
<h2>Header 2</h2>
|
||||
<hr>
|
||||
<h3>Header 3</h3>
|
||||
Some text
|
||||
<h4>Header 4</h4>
|
||||
<blockquote>
|
||||
More quoted text!
|
||||
</blockquote>
|
||||
<p>Paragraph tag!</p>
|
||||
Final line
|
||||
</blockquote>
|
||||
</blockquote>
|
||||
Some ending text
|
||||
<b>just to make sure</b>
|
|
@ -0,0 +1,8 @@
|
|||
<html>
|
||||
<body>
|
||||
<?a
|
||||
I am a random piece of code
|
||||
?>
|
||||
Hello
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,220 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="viewport" content="width=680">
|
||||
</head>
|
||||
<body class="cat-update-email cat-update" style="background: #ffccee; color: blue; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; text-align: center" bgcolor="#ffccee">
|
||||
<style type="text/css">
|
||||
body.cat-update-email {
|
||||
margin: 0; padding: 0; background: #ffccee; color: blue; text-align: center;
|
||||
}
|
||||
body.cat-update-email {
|
||||
font-size: 12px; font-family: Times New Roman; font-weight: normal;
|
||||
}
|
||||
body.cat-update-email th {
|
||||
font-size: 12px; font-family: Times New Roman; font-weight: normal;
|
||||
}
|
||||
body.cat-update-email td {
|
||||
font-size: 12px; font-family: Times New Roman; font-weight: normal;
|
||||
}
|
||||
</style>
|
||||
<table class="header-wrapper" style="border-spacing: 0; border: none; margin: 0; width: 100%">
|
||||
<tr>
|
||||
<td class="header" style="background: none; color: #999; font-family: Times New Roman; font-size: 12px; font-weight: normal; padding: 15px 0">
|
||||
<table cellspacing="0" cellpadding="0" border="0" style="margin: 0 auto; padding: 0 20px; width: 640px">
|
||||
<tr>
|
||||
<th style="font-family: Times New Roman; font-size: 12px; font-weight: normal">
|
||||
<a class="logo" href="http://localhost/home" style="color: red; text-decoration: none">
|
||||
<img border="0" height="32" src="test.png" width="200" style="display: block">
|
||||
</a> </th>
|
||||
<td class="account-number" style="color: white; font-family: Times New Roman; font-size: 12px; font-weight: normal; text-align: right" align="right">
|
||||
16 December 2015<br>
|
||||
Account 123
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
|
||||
<tr>
|
||||
<td class="salutation section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
|
||||
<h1 class="user_greeting" style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0 0 1em">
|
||||
Hi Susan
|
||||
</h1>
|
||||
<p class="message" style="font-size: 1.5em; line-height: 1.2; margin: 0">
|
||||
Here is your cat report.
|
||||
</p>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
|
||||
|
||||
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
|
||||
<tr>
|
||||
<td class="balance section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
|
||||
<div class="account-status-heading" style="font-size: 2.5em; line-height: 1em; padding: 30px 20px; text-align: center" align="center">You have found <span class="status-cats-negative" style="color: #df0000">5 cats</span> less than anyone else</div>
|
||||
|
||||
<div id="cat-update-action-buttons">
|
||||
<div id="buy-button" style="text-align: center" align="center">
|
||||
<a class="btn-alert" href="http://localhost/cats" id="buy-cats-button" style="-moz-appearance: none; -webkit-appearance: none; background: #DF0000; border-radius: 3px; border: 11px solid #df0000; color: #fff; cursor: pointer; display: block; font-size: 16px; height: 16px; line-height: 16px; margin: 0 auto; text-decoration: none; transition: background-color .15s; width: 120px">Find more cats</a>
|
||||
</div>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
|
||||
<tr>
|
||||
<td class="cats section" id="cats" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
|
||||
<div class="cats-usage">
|
||||
<h2 style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0">Down the road</h2>
|
||||
<p class="fine-print" style="margin: 0">Across the hall</p>
|
||||
|
||||
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Your achievements</h3>
|
||||
<table class="current-usage with-icon-left" style="border-collapse: collapse; border-spacing: 0; margin-bottom: 20px; margin-top: 20px; width: 100%">
|
||||
<tr>
|
||||
<th style="border: none; font-family: Times New Roman; font-size: 14px; font-weight: bold; margin: 0; padding: 0; text-align: left; vertical-align: middle; width: 50px" align="left" valign="middle"><img src="test.png"></th>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top">
|
||||
<div class="top">You're currently finding about</div>
|
||||
<div class="large" style="color: black; font-size: 18px; padding: 4px 0">12 cats</div>
|
||||
<div class="bottom">per day</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><td colspan="2" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top"> </td></tr>
|
||||
<tr>
|
||||
<td colspan="2" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top"><img alt="Number of cats found" src="test.png"></td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="summary">
|
||||
<hr class="fine-print" style="border-bottom-color: #eee; border-bottom-style: solid; border-width: 0 0 1px; margin: 20px 0">
|
||||
|
||||
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Your last cat was found two days ago.</h3>
|
||||
<p class="fine-print" style="margin: 0">One type of cat is a kitten.</p>
|
||||
|
||||
<table class="readings" style="border-collapse: collapse; border-spacing: 0; margin: 10px 0; width: 100%">
|
||||
<tr style="color: #BD236C">
|
||||
<td class="left-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 5%">
|
||||
<img src="test.png" style="padding-top: 10px">
|
||||
</td>
|
||||
<td class="center-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 60%">
|
||||
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Special account <span class="nickname" style="font-size: 12px"></span> <span class="fine-print">A1</span>
|
||||
</h3>
|
||||
</td>
|
||||
<td class="right-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 20%">
|
||||
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">12.345</h3>
|
||||
</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
</div>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<div class="banner" style="margin: 0 auto 20px; padding: 10px; text-align: center; width: 640px" align="center">
|
||||
<a href="http://localhost/logout" style="color: red; text-decoration: none">
|
||||
<img alt="" border="0" height="177" src="http://localhost/photo1.png" width="600">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
|
||||
<tr>
|
||||
<td class="tips section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
|
||||
<table style="border-collapse: collapse; border-spacing: 0; width: 100%">
|
||||
<tr>
|
||||
<td colspan="3" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><h2 style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0 0 10px">How can you find more cats?</h2></td>
|
||||
</tr>
|
||||
|
||||
<tr class="icon">
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo1.png" width="40"></td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo2.png" width="40"></td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo3.png" width="40"></td>
|
||||
</tr>
|
||||
|
||||
<tr class="subtitle">
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Look in trash cans</h3></td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Start meowing</h3></td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Eat cat food</h3></td>
|
||||
</tr>
|
||||
|
||||
<tr class="body" style="color: green">
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">Some cats like to hang out in trash cans. Some cats do not.</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">Some cats are attracted to similar tones.</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">So one day your tears may smell like cat food, attracting more cats.</td>
|
||||
</tr>
|
||||
|
||||
<tr class="image">
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">
|
||||
<a href="https://localhost/about" style="color: red; text-decoration: none">
|
||||
<img border="0" height="130" src="http://localhost/photo1.png" style="display: block; margin: 10px 0" width="165">
|
||||
</a>
|
||||
</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
|
||||
<a href="https://localhost/about" style="color: red; text-decoration: none">
|
||||
<img border="0" height="130" src="http://localhost/photo2.png" style="display: block; margin: 10px 0" width="165">
|
||||
</a>
|
||||
</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
|
||||
<a href="https://localhost/about" style="color: red; text-decoration: none">
|
||||
<img border="0" height="130" src="http://localhost/photo3.png" style="display: block; margin: 10px 0" width="165">
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr class="tips-footer" style="color: green">
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">
|
||||
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Cats are great.</a>
|
||||
</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
|
||||
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Find more cats.</a>
|
||||
</td>
|
||||
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
|
||||
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Do more things.</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<table class="footer-wrapper" style="margin: 0 auto 20px">
|
||||
<tr>
|
||||
<td class="footer" style="color: #9B9B9B; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 4em; text-align: left; width: 600px" align="left">
|
||||
<h3 style="font-family: Times New Roman; font-size: 1.2; font-weight: normal; line-height: 2em; margin: 0">
|
||||
<a href="http://localhost/contact" style="color: red; text-decoration: none">Contact us</a>
|
||||
</h3>
|
||||
<p style="margin: 0 0 1em">
|
||||
cats@cats.com<br>
|
||||
Monday and Friday
|
||||
</p>
|
||||
|
||||
<p style="margin: 0 0 1em"><a href="https://github.com/soundasleep/html2text" style="color: red; text-decoration: none"><img align="absmiddle" height="26" src="test.png" width="26"></a>
|
||||
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none"><img align="absmiddle" height="26" src="test.png" width="26"></a>
|
||||
</p>
|
||||
|
||||
<p class="message no-web-display" style="margin: 0">Having trouble seeing this email?
|
||||
<a href="http://localhost/view_it_online" style="color: red; text-decoration: none">View it online</a>.
|
||||
</p>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<script async type="text/javascript" id="profiler" src="/profiler.js" data-version="1.0"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,54 @@
|
|||
<body>
|
||||
<p>
|
||||
One: <img src="one.png">
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Two: <img src="two.png" alt="two">
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Three: <img src="three.png" title="three">
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Four: <img src="four.png" title="four" alt="four alt">
|
||||
</p>
|
||||
|
||||
<h1>With links</h1>
|
||||
|
||||
<p>
|
||||
One: <a href="http://localhost"><img src="one.png"></a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Two: <a href="http://localhost"><img src="two.png" alt="two"></a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Three: <a href="http://localhost"><img src="three.png" title="three"></a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Four: <a href="http://localhost"><img src="four.png" title="four" alt="four alt"></a>
|
||||
</p>
|
||||
|
||||
<h1>With links with titles</h1>
|
||||
|
||||
<p>
|
||||
One: <a href="http://localhost" title="one link"><img src="one.png"></a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Two: <a href="http://localhost" title="two link"><img src="two.png" alt="two"></a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Three: <a href="http://localhost" title="three link"><img src="three.png" title="three"></a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Four: <a href="http://localhost" title="four link"><img src="four.png" title="four" alt="four alt"></a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,4 @@
|
|||
<b>Hello &nbsnbsp; world</b>
|
||||
<div class=">
|
||||
Error
|
||||
</div>
|
|
@ -0,0 +1,24 @@
|
|||
<h1>List tests</h1>
|
||||
|
||||
<p>
|
||||
Add some lists.
|
||||
</p>
|
||||
|
||||
<ol>
|
||||
<li>one</li>
|
||||
<li>two
|
||||
<li>three
|
||||
</ol>
|
||||
|
||||
<h2>An unordered list</h2>
|
||||
|
||||
<ul>
|
||||
<li>one
|
||||
<li>two</li>
|
||||
<li>three</li>
|
||||
</ul>
|
||||
<ul>
|
||||
<li>one
|
||||
<li>two</li>
|
||||
<li>three</li>
|
||||
</ul>
|
|
@ -0,0 +1,14 @@
|
|||
<h1>Anchor tests</h1>
|
||||
|
||||
<p>
|
||||
Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
To mail, email support@openiaml.org or mailto:support@openiaml.org
|
||||
or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
|
||||
</p>
|
|
@ -0,0 +1 @@
|
|||
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>The addition of <o:p> tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single line 
return<o:p></o:p></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p> </o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>
|
|
@ -0,0 +1 @@
|
|||
hello world & people < > &NBSP;
|
|
@ -0,0 +1,17 @@
|
|||
<html>
|
||||
<body>
|
||||
<div>
|
||||
Just two divs
|
||||
</div>
|
||||
<div>
|
||||
Hanging out
|
||||
</div>
|
||||
<div><div><div>Nested divs and line breaks</div></div><br></div>
|
||||
<div><div>Nested divs and line breaks</div>More text<br></div>
|
||||
<div><br></div>
|
||||
<div>Just text</div>
|
||||
<div>Just text<br></div>
|
||||
<div>Just text<br><br></div>
|
||||
This is the end!
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,50 @@
|
|||
<html>
|
||||
<body>
|
||||
<div>
|
||||
Hello
|
||||
<br>
|
||||
</div>
|
||||
<div>
|
||||
How are you?
|
||||
<br>
|
||||
</div>
|
||||
|
||||
<p>
|
||||
How are you?
|
||||
<br>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
How are you?
|
||||
<br>
|
||||
</p>
|
||||
|
||||
<div>
|
||||
Just two divs
|
||||
</div>
|
||||
<div>
|
||||
Hanging out
|
||||
</div>
|
||||
|
||||
This is not the end!
|
||||
<div>
|
||||
How are you again?
|
||||
<br>
|
||||
</div>
|
||||
This is the end!
|
||||
<br>
|
||||
Just kidding
|
||||
<h1>Header 1</h1>
|
||||
Some text
|
||||
<hr>
|
||||
Some more text
|
||||
<p>Paragraph tag!</p>
|
||||
<h2>Header 2</h2>
|
||||
<hr>
|
||||
<h3>Header 3</h3>
|
||||
Some text
|
||||
<h4>Header 4</h4>
|
||||
<p>Paragraph tag!</p>
|
||||
Final line
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1 @@
|
|||
these spaces are non-breaking
|
|
@ -0,0 +1,10 @@
|
|||
Here is the code
|
||||
<pre>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(){
|
||||
return 0;
|
||||
};
|
||||
|
||||
</pre>
|
|
@ -0,0 +1,53 @@
|
|||
<html>
|
||||
<title>Ignored Title</title>
|
||||
<body>
|
||||
<h1>Hello, World!</h1>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Col A</th>
|
||||
<th>Col B</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
Data A1
|
||||
</td>
|
||||
<td>
|
||||
Data B1
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
Data A2
|
||||
</td>
|
||||
<td>
|
||||
Data B2
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
Data A3
|
||||
</td>
|
||||
<td>
|
||||
Data B4
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
<tfoot>
|
||||
<tr>
|
||||
<td>
|
||||
Total A
|
||||
</td>
|
||||
<td>
|
||||
Total B
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</tfoot>
|
||||
|
||||
</table>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1 @@
|
|||
test one<br />test two
|
|
@ -0,0 +1 @@
|
|||
1<br />2<br />3<br />4<br />5 < 6
|
|
@ -0,0 +1,4 @@
|
|||
<ul>
|
||||
<li>ÅÄÖ</li>
|
||||
<li>åäö</li>
|
||||
</ul>
|
|
@ -0,0 +1,4 @@
|
|||
<ul>
|
||||
<li>ÅÄÖ</li>
|
||||
<li>åäö</li>
|
||||
</ul>
|
|
@ -0,0 +1 @@
|
|||
<p>foo‌bar</p>
|
|
@ -0,0 +1,5 @@
|
|||
A document without any HTML open/closing tags.
|
||||
---------------------------------------------------------------
|
||||
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. visit foo.com - or http://www.foo.com link
|
||||
|
||||
An anchor which will not appear
|
|
@ -0,0 +1,5 @@
|
|||
A document without any HTML open/closing tags.
|
||||
---------------------------------------------------------------
|
||||
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com)
|
||||
|
||||
[An anchor which will not appear]
|
|
@ -0,0 +1,15 @@
|
|||
Hello, World!
|
||||
|
||||
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
|
||||
|
||||
Even mismatched tags.
|
||||
|
||||
A div
|
||||
Another div
|
||||
A div
|
||||
within a div
|
||||
|
||||
Another line
|
||||
Yet another line
|
||||
|
||||
A link
|
|
@ -0,0 +1,15 @@
|
|||
Hello, World!
|
||||
|
||||
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
|
||||
|
||||
Even mismatched tags.
|
||||
|
||||
A div
|
||||
Another div
|
||||
A div
|
||||
within a div
|
||||
|
||||
Another line
|
||||
Yet another line
|
||||
|
||||
[A link](http://foo.com)
|
|
@ -0,0 +1,44 @@
|
|||
Hello
|
||||
|
||||
> Nest some block quotes with preformated text
|
||||
>
|
||||
>> Here is the code
|
||||
>>
|
||||
>> #include <stdlib.h>
|
||||
>> #include <stdio.h>
|
||||
>>
|
||||
>> int main(){
|
||||
>> return 0;
|
||||
>> };
|
||||
>>
|
||||
>> Put some tags at the end
|
||||
>
|
||||
> Some text and tags here
|
||||
>
|
||||
>> First line
|
||||
>>
|
||||
>> Header 1
|
||||
>>
|
||||
>> Some text
|
||||
>> ---------------------------------------------------------------
|
||||
>> Some more text
|
||||
>>
|
||||
>> Paragraph tag!
|
||||
>>
|
||||
>> Header 2
|
||||
>>
|
||||
>> ---------------------------------------------------------------
|
||||
>>
|
||||
>> Header 3
|
||||
>>
|
||||
>> Some text
|
||||
>>
|
||||
>> Header 4
|
||||
>>
|
||||
>>> More quoted text!
|
||||
>>
|
||||
>> Paragraph tag!
|
||||
>>
|
||||
>> Final line
|
||||
|
||||
Some ending text just to make sure
|
|
@ -0,0 +1 @@
|
|||
Hello
|
|
@ -0,0 +1,53 @@
|
|||
http://localhost/home 16 December 2015
|
||||
Account 123
|
||||
|
||||
Hi Susan
|
||||
|
||||
Here is your cat report.
|
||||
|
||||
You have found 5 cats less than anyone else
|
||||
[Find more cats](http://localhost/cats)
|
||||
|
||||
Down the road
|
||||
|
||||
Across the hall
|
||||
|
||||
Your achievements
|
||||
|
||||
You're currently finding about
|
||||
12 cats
|
||||
per day
|
||||
|
||||
[Number of cats found]
|
||||
---------------------------------------------------------------
|
||||
|
||||
Your last cat was found two days ago.
|
||||
|
||||
One type of cat is a kitten.
|
||||
|
||||
Special account A1
|
||||
|
||||
12.345
|
||||
|
||||
http://localhost/logout
|
||||
|
||||
How can you find more cats?
|
||||
|
||||
Look in trash cans
|
||||
|
||||
Start meowing
|
||||
|
||||
Eat cat food
|
||||
|
||||
Some cats like to hang out in trash cans. Some cats do not. Some cats are attracted to similar tones. So one day your tears may smell like cat food, attracting more cats.
|
||||
https://localhost/about https://localhost/about https://localhost/about
|
||||
[Cats are great.](https://github.com/soundasleep/html2text_ruby) [Find more cats.](https://github.com/soundasleep/html2text_ruby) [Do more things.](https://github.com/soundasleep/html2text_ruby)
|
||||
|
||||
[Contact us](http://localhost/contact)
|
||||
|
||||
cats@cats.com
|
||||
Monday and Friday
|
||||
|
||||
https://github.com/soundasleep/html2text https://github.com/soundasleep/html2text_ruby
|
||||
|
||||
Having trouble seeing this email? [View it online](http://localhost/view_it_online).
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
|||
One:
|
||||
|
||||
Two: [two]
|
||||
|
||||
Three: [three]
|
||||
|
||||
Four: [four]
|
||||
|
||||
With links
|
||||
|
||||
One: http://localhost
|
||||
|
||||
Two: [two](http://localhost)
|
||||
|
||||
Three: [three](http://localhost)
|
||||
|
||||
Four: [four](http://localhost)
|
||||
|
||||
With links with titles
|
||||
|
||||
One: [one link](http://localhost)
|
||||
|
||||
Two: [two link](http://localhost)
|
||||
|
||||
Three: [three link](http://localhost)
|
||||
|
||||
Four: [four link](http://localhost)
|
|
@ -0,0 +1 @@
|
|||
Hello &nbsnbsp; world
|
|
@ -0,0 +1,17 @@
|
|||
List tests
|
||||
|
||||
Add some lists.
|
||||
|
||||
- one
|
||||
- two
|
||||
- three
|
||||
|
||||
An unordered list
|
||||
|
||||
- one
|
||||
- two
|
||||
- three
|
||||
|
||||
- one
|
||||
- two
|
||||
- three
|
|
@ -0,0 +1,7 @@
|
|||
Anchor tests
|
||||
|
||||
Visit http://openiaml.org or openiaml.org or http://openiaml.org.
|
||||
|
||||
To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
|
||||
|
||||
To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.
|
|
@ -0,0 +1,12 @@
|
|||
Dear html2text,
|
||||
|
||||
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
|
||||
|
||||
The addition of <o:p> tags is very annoying!
|
||||
This is a single line return
|
||||
|
||||
This is bold
|
||||
This is italic
|
||||
This is underline
|
||||
|
||||
Andrew
|
|
@ -0,0 +1 @@
|
|||
hello world & people < > &NBSP;
|
|
@ -0,0 +1,12 @@
|
|||
Just two divs
|
||||
Hanging out
|
||||
Nested divs and line breaks
|
||||
|
||||
Nested divs and line breaks
|
||||
More text
|
||||
|
||||
Just text
|
||||
Just text
|
||||
Just text
|
||||
|
||||
This is the end!
|
|
@ -0,0 +1,35 @@
|
|||
Hello
|
||||
How are you?
|
||||
|
||||
How are you?
|
||||
|
||||
How are you?
|
||||
|
||||
Just two divs
|
||||
Hanging out
|
||||
This is not the end!
|
||||
How are you again?
|
||||
This is the end!
|
||||
Just kidding
|
||||
|
||||
Header 1
|
||||
|
||||
Some text
|
||||
---------------------------------------------------------------
|
||||
Some more text
|
||||
|
||||
Paragraph tag!
|
||||
|
||||
Header 2
|
||||
|
||||
---------------------------------------------------------------
|
||||
|
||||
Header 3
|
||||
|
||||
Some text
|
||||
|
||||
Header 4
|
||||
|
||||
Paragraph tag!
|
||||
|
||||
Final line
|
|
@ -0,0 +1 @@
|
|||
these spaces are non-breaking
|
|
@ -0,0 +1,8 @@
|
|||
Here is the code
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(){
|
||||
return 0;
|
||||
};
|
|
@ -0,0 +1,7 @@
|
|||
Hello, World!
|
||||
|
||||
Col A Col B
|
||||
Data A1 Data B1
|
||||
Data A2 Data B2
|
||||
Data A3 Data B4
|
||||
Total A Total B
|
|
@ -0,0 +1,2 @@
|
|||
test one
|
||||
test two
|
|
@ -0,0 +1,5 @@
|
|||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5 < 6
|
|
@ -0,0 +1,2 @@
|
|||
- ÅÄÖ
|
||||
- åäö
|
|
@ -0,0 +1,2 @@
|
|||
- ÅÄÖ
|
||||
- åäö
|
|
@ -0,0 +1 @@
|
|||
foobar
|
Loading…
Reference in New Issue