Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Continuous integration workflow.
#
# Jobs:
#   test     - lint and run the test suite on each supported Python version.
#   security - run Snyk and Bandit over the repository.
#   docker   - on pushes to main only, after test and security succeed,
#              build the API and MCP images and push them to GHCR.
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# Least-privilege default for GITHUB_TOKEN. Jobs that need more (the image
# push below) request it explicitly at job level.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so that 3.10 is not parsed as the float 3.1.
        python-version: ['3.9', '3.10', '3.11', '3.12']

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest pytest-cov black isort mypy ruff
          # The extras path is quoted so the shell never treats [all] as a glob.
          pip install "./packages/markitdown[all]"
          pip install ./packages/markitdown-mcp
          pip install ./packages/markitdown-api
          pip install ./packages/markitdown-sample-plugin

      - name: Run linters
        run: |
          black --check packages/
          isort --check packages/
          mypy packages/
          ruff check packages/

      - name: Run tests with coverage
        run: |
          pytest --cov=markitdown --cov=markitdown_mcp --cov=markitdown_api --cov-report=xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml

  security:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run Snyk to check for vulnerabilities
        # NOTE(review): @master is a mutable ref; consider pinning to a
        # release tag or commit SHA.
        uses: snyk/actions/python@master
        env:
          SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
        with:
          args: --severity-threshold=high

      - name: Run Bandit
        run: |
          pip install bandit
          bandit -r packages/ -c pyproject.toml

  docker:
    runs-on: ubuntu-latest
    needs: [test, security]
    # Publish images only for commits that land on main, never for PRs.
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    # Pushing to ghcr.io with GITHUB_TOKEN requires packages: write;
    # contents: read is kept so that actions/checkout still works.
    permissions:
      contents: read
      packages: write

    steps:
      - uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push API image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./packages/markitdown-api/Dockerfile
          push: true
          # Each image gets a moving "latest" tag plus an immutable commit tag.
          tags: |
            ghcr.io/${{ github.repository }}/markitdown-api:latest
            ghcr.io/${{ github.repository }}/markitdown-api:${{ github.sha }}

      - name: Build and push MCP image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./packages/markitdown-mcp/Dockerfile
          push: true
          tags: |
            ghcr.io/${{ github.repository }}/markitdown-mcp:latest
            ghcr.io/${{ github.repository }}/markitdown-mcp:${{ github.sha }}
69 changes: 69 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Release workflow, triggered when a GitHub release is published.
#
# Jobs:
#   pypi   - build each package under packages/ and upload it to PyPI.
#   docker - after all PyPI uploads succeed, build and push the API and MCP
#            images to GHCR, tagged "latest" and with the release version.
name: Release

on:
  release:
    types: [published]

# Least-privilege default for GITHUB_TOKEN; the docker job widens it below.
permissions:
  contents: read

jobs:
  pypi:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        package:
          - markitdown
          - markitdown-mcp
          - markitdown-api
          - markitdown-sample-plugin

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install build tools
        run: python -m pip install --upgrade pip build twine

      - name: Build package
        run: python -m build packages/${{ matrix.package }}

      - name: Publish to PyPI
        env:
          # PyPI API tokens authenticate with the literal username __token__.
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: twine upload packages/${{ matrix.package }}/dist/*

  docker:
    runs-on: ubuntu-latest
    needs: pypi
    # Pushing to ghcr.io with GITHUB_TOKEN requires packages: write;
    # contents: read is kept so that actions/checkout still works.
    permissions:
      contents: read
      packages: write
    strategy:
      matrix:
        package:
          - markitdown-api
          - markitdown-mcp

    steps:
      - uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract version
        id: version
        # GITHUB_REF_NAME is the bare tag name. Stripping an optional leading
        # "v" yields a valid image tag whether the release is tagged v1.2.3 or
        # 1.2.3; stripping "refs/tags/v" from GITHUB_REF would leave the full
        # ref (with slashes) when the tag has no "v" prefix.
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"

      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./packages/${{ matrix.package }}/Dockerfile
          push: true
          tags: |
            ghcr.io/${{ github.repository }}/${{ matrix.package }}:latest
            ghcr.io/${{ github.repository }}/${{ matrix.package }}:${{ steps.version.outputs.VERSION }}
51 changes: 51 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Local stack: a hot-reloading API server, a production-like API server, and
# the MCP server. All three images are built from the repository root.
version: '3.8'

services:
  # Development API server
  api-dev:
    build:
      context: .
      dockerfile: packages/markitdown-api/Dockerfile
    ports:
      - "8000:8000"
    environment:
      - MARKITDOWN_API_HOST=0.0.0.0
      - MARKITDOWN_API_PORT=8000
      - MARKITDOWN_API_WORKERS=1
      - MARKITDOWN_API_ENABLE_PLUGINS=true
      - MARKITDOWN_API_RATE_LIMIT=0
      - MARKITDOWN_API_MAX_FILE_SIZE=52428800  # 50MB
    volumes:
      # Overlay the installed package with the working tree for live edits.
      # NOTE(review): this assumes src/ itself is the package directory. If
      # src/ contains a markitdown_api/ subdirectory, mount that instead.
      - ./packages/markitdown-api/src:/usr/local/lib/python3.13/site-packages/markitdown_api
    # The image declares ENTRYPOINT ["markitdown-api"], and Compose appends
    # `command` to the entrypoint. Setting `command` here would therefore run
    # "markitdown-api python -m uvicorn ..." rather than uvicorn, so the
    # entrypoint itself is replaced.
    entrypoint:
      - python
      - -m
      - uvicorn
      - markitdown_api:app
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --reload

  # Production API server
  api-prod:
    build:
      context: .
      dockerfile: packages/markitdown-api/Dockerfile
    ports:
      - "8001:8000"
    environment:
      - MARKITDOWN_API_HOST=0.0.0.0
      - MARKITDOWN_API_PORT=8000
      - MARKITDOWN_API_WORKERS=2
      - MARKITDOWN_API_ENABLE_PLUGINS=false
      - MARKITDOWN_API_RATE_LIMIT=60
      - MARKITDOWN_API_MAX_FILE_SIZE=10485760  # 10MB
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 1G

  # MCP server
  mcp:
    build:
      context: .
      dockerfile: packages/markitdown-mcp/Dockerfile
    ports:
      - "3001:3001"
    environment:
      - MARKITDOWN_ENABLE_PLUGINS=false
    # NOTE(review): if the MCP image also declares an ENTRYPOINT, this command
    # is appended to it instead of replacing it; confirm against
    # packages/markitdown-mcp/Dockerfile.
    command: ["markitdown-mcp", "--http", "--host", "0.0.0.0", "--port", "3001"]
51 changes: 51 additions & 0 deletions packages/markitdown-api/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Two-stage build for the markitdown-api server.
#
# Stage 1 (builder) installs the packages with a compiler toolchain available.
# Stage 2 copies only the installed site-packages and the console script into
# a clean image with the runtime tools (ffmpeg, exiftool).
#
# Build from the repository root: the build context must contain packages/.
FROM python:3.13-slim-bullseye AS builder

WORKDIR /build

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy package files
COPY . /build

# Install dependencies (the extras path is quoted so the shell cannot glob it)
RUN pip install --no-cache-dir \
    "/build/packages/markitdown[all]" \
    /build/packages/markitdown-api


FROM python:3.13-slim-bullseye

ENV DEBIAN_FRONTEND=noninteractive
ENV EXIFTOOL_PATH=/usr/bin/exiftool
ENV FFMPEG_PATH=/usr/bin/ffmpeg
ENV MARKITDOWN_API_HOST=0.0.0.0
ENV MARKITDOWN_API_PORT=8000
ENV MARKITDOWN_API_WORKERS=2
ENV MARKITDOWN_API_ENABLE_PLUGINS=false
ENV MARKITDOWN_API_RATE_LIMIT=60
ENV MARKITDOWN_API_MAX_FILE_SIZE=10485760

# Runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    exiftool \
    && rm -rf /var/lib/apt/lists/*

# Copy installed packages from builder
COPY --from=builder /usr/local/lib/python3.13/site-packages /usr/local/lib/python3.13/site-packages
COPY --from=builder /usr/local/bin/markitdown-api /usr/local/bin/markitdown-api

# Create non-root user
RUN useradd -r -s /bin/false markitdown
USER markitdown

# Health check.
# curl is not installed in this stage, so the probe uses the Python
# interpreter that is already present. urlopen raises on connection failure
# and on HTTP error statuses, which gives a non-zero exit like `curl -f`.
# ${MARKITDOWN_API_PORT} is expanded by the shell when the probe runs.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${MARKITDOWN_API_PORT}/health')" || exit 1

EXPOSE ${MARKITDOWN_API_PORT}

ENTRYPOINT ["markitdown-api"]
78 changes: 78 additions & 0 deletions packages/markitdown-api/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# MarkItDown API

[![PyPI](https://img.shields.io/pypi/v/markitdown-api.svg)](https://pypi.org/project/markitdown-api/)
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

The `markitdown-api` package provides a REST API server for MarkItDown, allowing you to convert various file formats to markdown over HTTP.

## Features

- Convert files to markdown via file upload
- Convert URLs to markdown
- Support for various input formats (documents, images, audio, etc.)
- Swagger/OpenAPI documentation
- Rate limiting and authentication options
- Health monitoring endpoints

## Installation

```bash
pip install markitdown-api
```

## Usage

To start the API server:

```bash
markitdown-api
```

Or with a custom host and port:

```bash
markitdown-api --host 0.0.0.0 --port 8000
```

### API Documentation

Once the server is running, you can access:

- Swagger UI: http://localhost:8000/docs
- OpenAPI spec: http://localhost:8000/openapi.json

### API Endpoints

- `POST /convert/file` - Convert uploaded file to markdown
- `POST /convert/url` - Convert URL to markdown
- `GET /health` - Server health check

See the Swagger documentation for detailed API specifications.

## Security Considerations

- Set appropriate rate limits for your deployment
- Consider implementing authentication for production use
- Monitor server resources and implement appropriate limits

## Configuration

The server can be configured via environment variables:

- `MARKITDOWN_API_HOST` - Host to bind to (default: 127.0.0.1)
- `MARKITDOWN_API_PORT` - Port to listen on (default: 8000)
- `MARKITDOWN_API_WORKERS` - Number of worker processes (default: 1)
- `MARKITDOWN_API_ENABLE_PLUGINS` - Enable MarkItDown plugins (default: false)
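- `MARKITDOWN_API_RATE_LIMIT` - Requests per minute per IP (default: 60)
- `MARKITDOWN_API_MAX_FILE_SIZE` - Maximum upload size in bytes (the Docker image sets 10485760, i.e. 10 MB)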

## Development

To run the server in development mode with hot reloading:

```bash
uvicorn markitdown_api:app --reload
```

## License

MIT
Loading