很早之前就申请了微信公众号,写了几篇就中断了,主要原因是微信提供的公众号不太适合写技术类的文章,比如添加代码片段或 LaTeX 数学公式等。
针对 LaTeX 数学公式的支持,我看反馈论坛上已经提了 5~6 年了,就是没有实现,哎~
先前有一段时间通过复制粘贴的方式将Wordpress的几篇文章同步过去,但每次复制过去又会出现格式错乱,每次调整都要花很多时间。最近周末比较有时间,稍微研究了下公众号的接口,发现可以通过程序的方式将WordPress的文章处理后同步过去。具体处理流程如下:
- 使用Python获取Wordpress的文章
- 处理文章中的格式,将其转化为公众号接受的格式
- 去除无用的标签属性
- 修复 &lt;ul&gt;、&lt;ol&gt; 标签的格式问题
- 修复表格呈现问题
- 修复H1~H6标签大小问题
- 提取文章中链接,将其放到文章最后,并删除文章中的超链接
- 获取文章中的图片,将图片上传到微信公众号,更新文章中的图片链接。
当前暂未处理的:将 LaTeX 公式转化为图片。先前实现了采用 base64 格式的图片数据加载方式,测试下来微信公众号不支持;可行的解决方案是将公式转换后的每张图片上传到素材库,原理上可以实现,但一篇文章中涉及的公式太多,觉得不太合适,所以暂时没有处理。
具体代码如下:(代码比较Ugly,但好在能用)
import requests
import re
from io import BytesIO
from PIL import Image
import urllib.parse
import os
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse
import base64
import matplotlib.pyplot as plt
from functools import lru_cache

# Common headers used for every request against the WordPress site.
wp_headers = {
    'User-Agent': 'WordPress2Weixin'
}


@lru_cache(maxsize=1)
def _get_token():
    """Fetch and cache the WeChat Official Account API access token.

    The token stays valid for about two hours and WeChat limits how often a
    fresh one may be issued, so it is cached for the lifetime of the process
    instead of being re-fetched for every single API call (the original code
    requested a new token per image upload).

    :return: the ``access_token`` string returned by the WeChat token endpoint.
    """
    app_id = ''
    app_secret = ''
    token_url = (
        "https://api.weixin.qq.com/cgi-bin/token"
        "?grant_type=client_credential&appid={}&secret={}"
    ).format(app_id, app_secret)
    r = requests.get(token_url)
    print(r.json())
    return r.json()["access_token"]


def _upload_image(image_url):
    """Download one image and upload it to the WeChat permanent media library.

    :param image_url: absolute URL of the source image on the WordPress site.
    :return: parsed JSON response, e.g. ``{"media_id": "...", "url": "..."}``.
    """
    upload_url = (
        "https://api.weixin.qq.com/cgi-bin/material/add_material"
        "?access_token={}&type=image"
    ).format(_get_token())
    r_img = requests.get(image_url, stream=True, headers=wp_headers)
    image_file = BytesIO(r_img.content)
    image_file.seek(0)
    image = Image.open(image_file)
    image_type = image.format.lower()
    # WeChat's material API does not accept WebP; transcode to PNG first.
    if image_type == 'webp':
        image = image.convert("RGBA")
        image_file = BytesIO()
        image.save(image_file, format='PNG')
        image_type = 'png'
    # Rewind unconditionally: Image.open() may leave the stream positioned
    # past the header, which would truncate the uploaded body.
    image_file.seek(0)
    filename = os.path.splitext(
        os.path.basename(urllib.parse.urlparse(image_url).path)
    )[0] + '.' + image_type
    mime_type = 'image/' + image_type if image_type else 'application/octet-stream'
    files = {'media': (filename, image_file, mime_type)}
    # Do NOT set a multipart Content-Type header by hand: requests must
    # generate the boundary itself, and a hand-written header omits it,
    # which makes the server reject or mis-parse the upload.
    r_url = requests.post(upload_url, files=files)
    return r_url.json()


def _get_featured_media(media_id, parsed_url):
    """Resolve a WordPress featured-media id to a WeChat cover ``media_id``.

    :param media_id: WordPress attachment id of the post's cover image.
    :param parsed_url: ``urlparse()`` result of the post URL; its scheme and
        host are reused to build the WP REST API endpoint.
    :return: the WeChat ``media_id`` of the uploaded cover image, or ``None``
        if the upload response carried no ``media_id``.
    """
    media_url = "{}://{}/wp-json/wp/v2/media/{}".format(
        parsed_url.scheme, parsed_url.netloc, media_id)
    r = requests.get(media_url, headers=wp_headers)
    return _upload_image(r.json()["guid"]["rendered"]).get("media_id")


def _replace_image_urls(content):
    """Upload every ``src=`` image in *content* to WeChat and rewrite the HTML.

    WeChat strips images hosted elsewhere, so each image is re-hosted on the
    WeChat CDN and the original URL replaced with the returned one.

    :param content: HTML string of the article body.
    :return: the HTML with all image URLs pointing at the WeChat CDN.
    """
    image_urls = re.findall(r'src="(.*?)"', content)
    print(image_urls)
    # A dict keeps insertion order and de-duplicates repeated URLs, so an
    # image that appears several times is uploaded only once.
    replacements = {}
    for link in image_urls:
        if link not in replacements:
            replacements[link] = _upload_image(link).get("url")
    for old_url, new_url in replacements.items():
        content = content.replace(old_url, new_url)
    return content


def _fix_tags(content):
    """Normalise WordPress HTML so it renders acceptably inside WeChat.

    Applies, in order: strip the ez-toc table of contents, drop attributes
    WeChat ignores, remove span wrappers, rebuild lists, make tables
    scrollable, restore heading sizes, move hyperlinks to a footer list and
    strip ``href`` from the body.

    :param content: raw ``content.rendered`` HTML from the WP REST API.
    :return: the transformed HTML string.
    """
    # Strip the table of contents injected by the ez-toc plugin.
    content = re.sub(r'(<div id="ez-toc-container"[\S\s]*?</div>\n)', '', content)
    soup = BeautifulSoup(content, 'html.parser')
    # Drop attributes that WeChat either ignores or mangles.
    for tag in soup.find_all(True):
        for attr in ['class', 'id', 'style', 'data-enlighter-language',
                     'decoding', 'loading', 'alt']:
            del tag[attr]
    # Remove span wrappers.
    # NOTE(review): extract() removes the spans *and* any text inside them;
    # if spans ever carry real content, unwrap() would be the safer call —
    # kept as-is because the source spans appear to be empty wrappers.
    for span in soup.find_all('span'):
        span.extract()
    # Rebuild each list from its direct <li> children so WeChat renders
    # bullets/numbers correctly.
    for original_list_tag in soup.find_all(['ul', 'ol']):
        new_list_tag = soup.new_tag(original_list_tag.name)
        for li_tag in original_list_tag.find_all('li', recursive=False):
            new_list_tag.append(li_tag)
        original_list_tag.replace_with(new_list_tag)
    # Make wide tables horizontally scrollable instead of overflowing.
    for table_tag in soup.find_all('table'):
        del table_tag['width']
        table_tag.attrs['style'] = "width: 100%; overflow-x: auto; display: block;"
    # Restore a visible size hierarchy for H1–H6 (WeChat strips defaults).
    font_sizes = {
        'h1': '2.0em',
        'h2': '1.8em',
        'h3': '1.6em',
        'h4': '1.4em',
        'h5': '1.2em',
        'h6': '1.0em',
    }
    for tag_name, font_size in font_sizes.items():
        for tag in soup.find_all(tag_name):
            tag['style'] = f'font-size: {font_size};'
    # Collect hyperlinks into a numbered footer section...
    links_div = soup.new_tag('div')
    p_tag = soup.new_tag('p')
    p_tag.string = "可以点击阅读原文查看正文相关链接"
    links_div.append(p_tag)
    counter = 1
    for link in soup.find_all('a'):
        href = link.get("href")
        # Skip in-page anchors and links whose visible text is already a URL.
        if href and not href.startswith("#") and not re.match(r'https?://', link.text):
            p_tag = soup.new_tag('p')
            p_tag.string = f'{counter}. {link.text} {href}'
            links_div.append(p_tag)
            counter += 1
    soup.append(links_div)
    # ...then strip href from the body, since WeChat forbids external links.
    for a_tag in soup.find_all('a'):
        del a_tag['href']
    return str(soup)


# Convert a LaTeX formula to a PNG image (kept for reference; WeChat rejects
# base64 data: URIs, so this path is currently unused — see the article text).
# def _latex_to_image(latex_str):
#     print(latex_str)
#     fig = plt.figure(figsize=(2, 0.5))
#     fig.text(0, 0, f'${latex_str}$', fontsize=12, color='black')
#     buffer = BytesIO()
#     plt.savefig(buffer, format='png', bbox_inches='tight', pad_inches=0.0)
#     assert isinstance(fig, object)
#     plt.close(fig)
#     buffer.seek(0)
#     return buffer.read()
#
#
# Replace LaTeX formulas in the article text with inline base64 images.
# NOTE(review): the '$' in these patterns should be escaped (r'\$(.*?)\$');
# the backslashes were likely lost when the article was published.
# def _replace_latex_with_images(text):
#     inline_pattern = re.compile(r'$(.*?)$')
#     display_pattern = re.compile(r'$$(.*?)$$')
#
#     def replace_func(match):
#         latex_str = match.group(1)
#         try:
#             img_data = _latex_to_image(latex_str)
#             encoded_img_data = base64.b64encode(img_data).decode()
#             return f'<img src="data:image/png;base64,{encoded_img_data}" />'
#         except:
#             return latex_str
#
#     # Replace display (block) LaTeX formulas.
#     text = display_pattern.sub(replace_func, text)
#     # Replace inline LaTeX formulas.
#     text = inline_pattern.sub(replace_func, text)
#
#     return text


def add_draft(post_url):
    """Fetch a WordPress post and submit it as a WeChat draft article.

    :param post_url: public URL of the WordPress post.
    :raises ValueError: when no post id can be located in the page HTML
        (the original code raised a bare ``IndexError`` here).
    """
    r = requests.get(post_url, headers=wp_headers)
    match = re.search(
        r'<div class="container site-content"><div id="post-(\d+)">', r.text)
    if match is None:
        raise ValueError(f"Could not find a post id in {post_url}")
    post_id = match.group(1)
    parsed_url = urlparse(post_url)
    post_url_json = "{}://{}/wp-json/wp/v2/posts/{}".format(
        parsed_url.scheme, parsed_url.netloc, post_id)
    r_json = requests.get(post_url_json, headers=wp_headers)
    post_raw = r_json.json()
    post_data = {
        "title": post_raw['title']['rendered'],
        "author": "biaodianfu",
        "content": _replace_image_urls(_fix_tags(post_raw['content']['rendered'])),
        "content_source_url": post_url,
        "thumb_media_id": _get_featured_media(post_raw['featured_media'], parsed_url),
        "need_open_comment": 0,
        "only_fans_can_comment": 0,
    }
    url = "https://api.weixin.qq.com/cgi-bin/draft/add?access_token={}".format(
        _get_token())
    headers = {'Content-Type': 'application/json'}
    # ensure_ascii=False keeps the Chinese text readable in the payload;
    # encode to UTF-8 explicitly so requests does not latin-1-encode it.
    data = json.dumps({"articles": [post_data]}, ensure_ascii=False).encode('utf-8')
    print(data)
    response = requests.post(url, data=data, headers=headers)
    print(response.content)
    if response and "media_id" in response.json():
        print("Draft created successfully.")
    else:
        print(f"Failed to create draft. Error response: {response}")


if __name__ == "__main__":
    post_url = ""
    add_draft(post_url)